blob: bd511024bb4f0ad6e437e35500cc9f6f67355c39 [file] [log] [blame]
ager@chromium.org381abbb2009-02-25 13:23:22 +00001// Copyright 2006-2009 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000031#include "compiler.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000032#include "execution.h"
33#include "factory.h"
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +000034#include "jsregexp.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000035#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000036#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000037#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000038#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000039#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000044#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000045
kasperl@chromium.org71affb52009-05-26 05:44:31 +000046#if V8_TARGET_ARCH_IA32
ager@chromium.org3a37e9b2009-04-27 09:26:21 +000047#include "ia32/macro-assembler-ia32.h"
48#include "ia32/regexp-macro-assembler-ia32.h"
ager@chromium.org9085a012009-05-11 19:22:57 +000049#elif V8_TARGET_ARCH_X64
50#include "x64/macro-assembler-x64.h"
51#include "x64/regexp-macro-assembler-x64.h"
52#elif V8_TARGET_ARCH_ARM
53#include "arm/regexp-macro-assembler-arm.h"
kasperl@chromium.org2abc4502009-07-02 07:00:29 +000054#else
55#error Unsupported target architecture.
ager@chromium.orga74f0da2008-12-03 16:05:52 +000056#endif
57
58#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000059
ager@chromium.orga74f0da2008-12-03 16:05:52 +000060
kasperl@chromium.org71affb52009-05-26 05:44:31 +000061namespace v8 {
62namespace internal {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000063
64
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000065Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
66 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000067 Handle<String> flags,
68 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000069 // Ensure that the constructor function has been loaded.
70 if (!constructor->IsLoaded()) {
71 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +000072 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000073 }
74 // Call the construct code with 2 arguments.
75 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
76 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000077 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000078}
79
80
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000081static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
82 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000083 for (int i = 0; i < str->length(); i++) {
84 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000085 case 'i':
86 flags |= JSRegExp::IGNORE_CASE;
87 break;
88 case 'g':
89 flags |= JSRegExp::GLOBAL;
90 break;
91 case 'm':
92 flags |= JSRegExp::MULTILINE;
93 break;
94 }
95 }
96 return JSRegExp::Flags(flags);
97}
98
99
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000100static inline void ThrowRegExpException(Handle<JSRegExp> re,
101 Handle<String> pattern,
102 Handle<String> error_text,
103 const char* message) {
104 Handle<JSArray> array = Factory::NewJSArray(2);
105 SetElement(array, 0, pattern);
106 SetElement(array, 1, error_text);
107 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
108 Top::Throw(*regexp_err);
109}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000110
111
ager@chromium.org8bb60582008-12-11 12:02:20 +0000112// Generic RegExp methods. Dispatches to implementation specific methods.
113
114
115class OffsetsVector {
116 public:
117 inline OffsetsVector(int num_registers)
118 : offsets_vector_length_(num_registers) {
119 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
120 vector_ = NewArray<int>(offsets_vector_length_);
121 } else {
122 vector_ = static_offsets_vector_;
123 }
124 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000125 inline ~OffsetsVector() {
126 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
127 DeleteArray(vector_);
128 vector_ = NULL;
129 }
130 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000131 inline int* vector() { return vector_; }
132 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000133
134 private:
135 int* vector_;
136 int offsets_vector_length_;
137 static const int kStaticOffsetsVectorSize = 50;
138 static int static_offsets_vector_[kStaticOffsetsVectorSize];
139};
140
141
142int OffsetsVector::static_offsets_vector_[
143 OffsetsVector::kStaticOffsetsVectorSize];
144
145
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000146Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
147 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000148 Handle<String> flag_str) {
149 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
150 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
151 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000152 LOG(RegExpCompileEvent(re, in_cache));
153
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000154 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000155 if (in_cache) {
156 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000157 return re;
158 }
159 FlattenString(pattern);
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000160 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000161 RegExpCompileData parse_result;
162 FlatStringReader reader(pattern);
163 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
164 // Throw an exception if we fail to parse the pattern.
165 ThrowRegExpException(re,
166 pattern,
167 parse_result.error,
168 "malformed_regexp");
169 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000170 }
171
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000172 if (parse_result.simple && !flags.is_ignore_case()) {
173 // Parse-tree is a single atom that is equal to the pattern.
174 AtomCompile(re, pattern, flags, pattern);
175 } else if (parse_result.tree->IsAtom() &&
176 !flags.is_ignore_case() &&
177 parse_result.capture_count == 0) {
178 RegExpAtom* atom = parse_result.tree->AsAtom();
179 Vector<const uc16> atom_pattern = atom->data();
180 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
181 AtomCompile(re, pattern, flags, atom_string);
182 } else {
183 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
184 }
185 ASSERT(re->data()->IsFixedArray());
186 // Compilation succeeded so the data is set on the regexp
187 // and we can store it in the cache.
188 Handle<FixedArray> data(FixedArray::cast(re->data()));
189 CompilationCache::PutRegExp(pattern, flags, data);
190
191 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000192}
193
194
195Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
196 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000197 int index,
198 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000199 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000200 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000201 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000202 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000203 Handle<Object> result =
204 IrregexpExec(regexp, subject, index, last_match_info);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000205 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000206 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000207 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000208 default:
209 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000210 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000211 }
212}
213
214
ager@chromium.org8bb60582008-12-11 12:02:20 +0000215// RegExp Atom implementation: Simple string search using indexOf.
216
217
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000218void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
219 Handle<String> pattern,
220 JSRegExp::Flags flags,
221 Handle<String> match_pattern) {
222 Factory::SetRegExpAtomData(re,
223 JSRegExp::ATOM,
224 pattern,
225 flags,
226 match_pattern);
227}
228
229
230static void SetAtomLastCapture(FixedArray* array,
231 String* subject,
232 int from,
233 int to) {
234 NoHandleAllocation no_handles;
235 RegExpImpl::SetLastCaptureCount(array, 2);
236 RegExpImpl::SetLastSubject(array, subject);
237 RegExpImpl::SetLastInput(array, subject);
238 RegExpImpl::SetCapture(array, 0, from);
239 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000240}
241
242
243Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
244 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000245 int index,
246 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000247 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000248
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000249 uint32_t start_index = index;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000250
ager@chromium.org7c537e22008-10-16 08:43:32 +0000251 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000252 if (value == -1) return Factory::null_value();
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000253 ASSERT(last_match_info->HasFastElements());
ager@chromium.org7c537e22008-10-16 08:43:32 +0000254
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000255 {
256 NoHandleAllocation no_handles;
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000257 FixedArray* array = FixedArray::cast(last_match_info->elements());
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000258 SetAtomLastCapture(array, *subject, value, value + needle->length());
259 }
260 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000261}
262
263
ager@chromium.org8bb60582008-12-11 12:02:20 +0000264// Irregexp implementation.
265
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000266// Ensures that the regexp object contains a compiled version of the
267// source for either ASCII or non-ASCII strings.
268// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000269// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000270// If compilation fails, an exception is thrown and this function
271// returns false.
ager@chromium.org41826e72009-03-30 13:30:57 +0000272bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000273#ifdef V8_NATIVE_REGEXP
274 if (re->DataAt(JSRegExp::code_index(is_ascii))->IsCode()) return true;
275#else // ! V8_NATIVE_REGEXP (RegExp interpreter code)
276 if (re->DataAt(JSRegExp::code_index(is_ascii))->IsByteArray()) return true;
277#endif
278 return CompileIrregexp(re, is_ascii);
279}
ager@chromium.org8bb60582008-12-11 12:02:20 +0000280
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000281
282bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000283 // Compile the RegExp.
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000284 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000285 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
286 if (entry->IsJSObject()) {
287 // If it's a JSObject, a previous compilation failed and threw this object.
288 // Re-throw the object without trying again.
289 Top::Throw(entry);
290 return false;
291 }
292 ASSERT(entry->IsTheHole());
ager@chromium.org8bb60582008-12-11 12:02:20 +0000293
294 JSRegExp::Flags flags = re->GetFlags();
295
296 Handle<String> pattern(re->Pattern());
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000297 if (!pattern->IsFlat()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000298 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000299 }
300
301 RegExpCompileData compile_data;
302 FlatStringReader reader(pattern);
303 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
304 // Throw an exception if we fail to parse the pattern.
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000305 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000306 ThrowRegExpException(re,
307 pattern,
308 compile_data.error,
309 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000310 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000311 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000312 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000313 RegExpEngine::Compile(&compile_data,
314 flags.is_ignore_case(),
315 flags.is_multiline(),
316 pattern,
317 is_ascii);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000318 if (result.error_message != NULL) {
319 // Unable to compile regexp.
320 Handle<JSArray> array = Factory::NewJSArray(2);
321 SetElement(array, 0, pattern);
322 SetElement(array,
323 1,
324 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
325 Handle<Object> regexp_err =
326 Factory::NewSyntaxError("malformed_regexp", array);
327 Top::Throw(*regexp_err);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000328 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000329 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000330 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000331
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000332 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
333 data->set(JSRegExp::code_index(is_ascii), result.code);
334 int register_max = IrregexpMaxRegisterCount(*data);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000335 if (result.num_registers > register_max) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000336 SetIrregexpMaxRegisterCount(*data, result.num_registers);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000337 }
338
339 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000340}
341
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000342
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000343int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
344 return Smi::cast(
345 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000346}
347
348
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000349void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
350 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000351}
352
353
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000354int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
355 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000356}
357
358
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000359int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
360 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000361}
362
363
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000364ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000365 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000366}
367
368
369Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000370 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000371}
372
373
374void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
375 Handle<String> pattern,
376 JSRegExp::Flags flags,
377 int capture_count) {
378 // Initialize compiled code entries to null.
379 Factory::SetRegExpIrregexpData(re,
380 JSRegExp::IRREGEXP,
381 pattern,
382 flags,
383 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000384}
385
386
ager@chromium.org41826e72009-03-30 13:30:57 +0000387Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000388 Handle<String> subject,
ager@chromium.org41826e72009-03-30 13:30:57 +0000389 int previous_index,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000390 Handle<JSArray> last_match_info) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000391 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000392
ager@chromium.org8bb60582008-12-11 12:02:20 +0000393 // Prepare space for the return values.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000394 int number_of_capture_registers =
ager@chromium.org41826e72009-03-30 13:30:57 +0000395 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000396
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000397#ifndef V8_NATIVE_REGEXP
ager@chromium.org8bb60582008-12-11 12:02:20 +0000398#ifdef DEBUG
399 if (FLAG_trace_regexp_bytecodes) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000400 String* pattern = jsregexp->Pattern();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000401 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
402 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
403 }
404#endif
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000405#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000406
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000407 if (!subject->IsFlat()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000408 FlattenString(subject);
409 }
410
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000411 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
412
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000413 Handle<FixedArray> array;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000414
ager@chromium.org41826e72009-03-30 13:30:57 +0000415 // Dispatch to the correct RegExp implementation.
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000416 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000417#ifdef V8_NATIVE_REGEXP
ager@chromium.org9085a012009-05-11 19:22:57 +0000418#if V8_TARGET_ARCH_IA32
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000419 OffsetsVector captures(number_of_capture_registers);
420 int* captures_vector = captures.vector();
421 RegExpMacroAssemblerIA32::Result res;
422 do {
ager@chromium.org5ec48922009-05-05 07:25:34 +0000423 bool is_ascii = subject->IsAsciiRepresentation();
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000424 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
425 return Handle<Object>::null();
426 }
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000427 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
428 res = RegExpMacroAssemblerIA32::Match(code,
429 subject,
430 captures_vector,
431 captures.length(),
432 previous_index);
433 // If result is RETRY, the string have changed representation, and we
434 // must restart from scratch.
435 } while (res == RegExpMacroAssemblerIA32::RETRY);
436 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
437 ASSERT(Top::has_pending_exception());
438 return Handle<Object>::null();
439 }
440 ASSERT(res == RegExpMacroAssemblerIA32::SUCCESS
441 || res == RegExpMacroAssemblerIA32::FAILURE);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000442
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000443 if (res != RegExpMacroAssemblerIA32::SUCCESS) return Factory::null_value();
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000444
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000445 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000446 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
447 // The captures come in (start, end+1) pairs.
448 for (int i = 0; i < number_of_capture_registers; i += 2) {
449 SetCapture(*array, i, captures_vector[i]);
450 SetCapture(*array, i + 1, captures_vector[i + 1]);
451 }
452#else // !V8_TARGET_ARCH_IA32
453 UNREACHABLE();
454#endif // V8_TARGET_ARCH_IA32
455#else // !V8_NATIVE_REGEXP
456 bool is_ascii = subject->IsAsciiRepresentation();
457 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
458 return Handle<Object>::null();
459 }
460 // Now that we have done EnsureCompiledIrregexp we can get the number of
461 // registers.
462 int number_of_registers =
463 IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
464 OffsetsVector registers(number_of_registers);
465 int* register_vector = registers.vector();
466 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
467 register_vector[i] = -1;
468 }
469 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
470
471 if (!IrregexpInterpreter::Match(byte_codes,
472 subject,
473 register_vector,
474 previous_index)) {
475 return Factory::null_value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000476 }
477
sgjesse@chromium.org0b6db592009-07-30 14:48:31 +0000478 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
kasperl@chromium.org68ac0092009-07-09 06:00:35 +0000479 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
480 // The captures come in (start, end+1) pairs.
481 for (int i = 0; i < number_of_capture_registers; i += 2) {
482 SetCapture(*array, i, register_vector[i]);
483 SetCapture(*array, i + 1, register_vector[i + 1]);
484 }
485#endif // V8_NATIVE_REGEXP
486
487 SetLastCaptureCount(*array, number_of_capture_registers);
488 SetLastSubject(*array, *subject);
489 SetLastInput(*array, *subject);
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000490
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000491 return last_match_info;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000492}
493
494
495// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000496// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000497//
498// The Irregexp regular expression engine is intended to be a complete
499// implementation of ECMAScript regular expressions. It generates either
500// bytecodes or native code.
501
502// The Irregexp regexp engine is structured in three steps.
503// 1) The parser generates an abstract syntax tree. See ast.cc.
504// 2) From the AST a node network is created. The nodes are all
505// subclasses of RegExpNode. The nodes represent states when
506// executing a regular expression. Several optimizations are
507// performed on the node network.
508// 3) From the nodes we generate either byte codes or native code
509// that can actually execute the regular expression (perform
510// the search). The code generation step is described in more
511// detail below.
512
513// Code generation.
514//
515// The nodes are divided into four main categories.
516// * Choice nodes
517// These represent places where the regular expression can
518// match in more than one way. For example on entry to an
519// alternation (foo|bar) or a repetition (*, +, ? or {}).
520// * Action nodes
521// These represent places where some action should be
522// performed. Examples include recording the current position
523// in the input string to a register (in order to implement
524// captures) or other actions on register for example in order
525// to implement the counters needed for {} repetitions.
526// * Matching nodes
527// These attempt to match some element part of the input string.
528// Examples of elements include character classes, plain strings
529// or back references.
530// * End nodes
531// These are used to implement the actions required on finding
532// a successful match or failing to find a match.
533//
534// The code generated (whether as byte codes or native code) maintains
535// some state as it runs. This consists of the following elements:
536//
537// * The capture registers. Used for string captures.
538// * Other registers. Used for counters etc.
539// * The current position.
540// * The stack of backtracking information. Used when a matching node
541// fails to find a match and needs to try an alternative.
542//
543// Conceptual regular expression execution model:
544//
545// There is a simple conceptual model of regular expression execution
546// which will be presented first. The actual code generated is a more
547// efficient simulation of the simple conceptual model:
548//
549// * Choice nodes are implemented as follows:
550// For each choice except the last {
551// push current position
552// push backtrack code location
553// <generate code to test for choice>
554// backtrack code location:
555// pop current position
556// }
557// <generate code to test for last choice>
558//
559// * Actions nodes are generated as follows
560// <push affected registers on backtrack stack>
561// <generate code to perform action>
562// push backtrack code location
563// <generate code to test for following nodes>
564// backtrack code location:
565// <pop affected registers to restore their state>
566// <pop backtrack location from stack and go to it>
567//
568// * Matching nodes are generated as follows:
569// if input string matches at current position
570// update current position
571// <generate code to test for following nodes>
572// else
573// <pop backtrack location from stack and go to it>
574//
575// Thus it can be seen that the current position is saved and restored
576// by the choice nodes, whereas the registers are saved and restored by
577// by the action nodes that manipulate them.
578//
579// The other interesting aspect of this model is that nodes are generated
580// at the point where they are needed by a recursive call to Emit(). If
581// the node has already been code generated then the Emit() call will
582// generate a jump to the previously generated code instead. In order to
583// limit recursion it is possible for the Emit() function to put the node
584// on a work list for later generation and instead generate a jump. The
585// destination of the jump is resolved later when the code is generated.
586//
587// Actual regular expression code generation.
588//
589// Code generation is actually more complicated than the above. In order
590// to improve the efficiency of the generated code some optimizations are
591// performed
592//
593// * Choice nodes have 1-character lookahead.
594// A choice node looks at the following character and eliminates some of
595// the choices immediately based on that character. This is not yet
596// implemented.
597// * Simple greedy loops store reduced backtracking information.
598// A quantifier like /.*foo/m will greedily match the whole input. It will
599// then need to backtrack to a point where it can match "foo". The naive
600// implementation of this would push each character position onto the
601// backtracking stack, then pop them off one by one. This would use space
602// proportional to the length of the input string. However since the "."
603// can only match in one way and always has a constant length (in this case
604// of 1) it suffices to store the current position on the top of the stack
605// once. Matching now becomes merely incrementing the current position and
606// backtracking becomes decrementing the current position and checking the
607// result against the stored current position. This is faster and saves
608// space.
609// * The current state is virtualized.
610// This is used to defer expensive operations until it is clear that they
611// are needed and to generate code for a node more than once, allowing
612// specialized an efficient versions of the code to be created. This is
613// explained in the section below.
614//
615// Execution state virtualization.
616//
617// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000618// manipulation in an object called the Trace. The Trace object can record a
619// current position offset, an optional backtrack code location on the top of
620// the virtualized backtrack stack and some register changes. When a node is
621// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000622// will emit code to bring the actual state into line with the virtual state.
623// Avoiding flushing the state can postpone some work (eg updates of capture
624// registers). Postponing work can save time when executing the regular
625// expression since it may be found that the work never has to be done as a
626// failure to match can occur. In addition it is much faster to jump to a
627// known backtrack code location than it is to pop an unknown backtrack
628// location from the stack and jump there.
629//
ager@chromium.org32912102009-01-16 10:38:43 +0000630// The virtual state found in the Trace affects code generation. For example
631// the virtual state contains the difference between the actual current
632// position and the virtual current position, and matching code needs to use
633// this offset to attempt a match in the correct location of the input
634// string. Therefore code generated for a non-trivial trace is specialized
635// to that trace. The code generator therefore has the ability to generate
636// code for each node several times. In order to limit the size of the
637// generated code there is an arbitrary limit on how many specialized sets of
638// code may be generated for a given node. If the limit is reached, the
639// trace is flushed and a generic version of the code for a node is emitted.
640// This is subsequently used for that node. The code emitted for non-generic
641// trace is not recorded in the node and so it cannot currently be reused in
642// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000643
644
645void RegExpTree::AppendToText(RegExpText* text) {
646 UNREACHABLE();
647}
648
649
650void RegExpAtom::AppendToText(RegExpText* text) {
651 text->AddElement(TextElement::Atom(this));
652}
653
654
655void RegExpCharacterClass::AppendToText(RegExpText* text) {
656 text->AddElement(TextElement::CharClass(this));
657}
658
659
660void RegExpText::AppendToText(RegExpText* text) {
661 for (int i = 0; i < elements()->length(); i++)
662 text->AddElement(elements()->at(i));
663}
664
665
666TextElement TextElement::Atom(RegExpAtom* atom) {
667 TextElement result = TextElement(ATOM);
668 result.data.u_atom = atom;
669 return result;
670}
671
672
673TextElement TextElement::CharClass(
674 RegExpCharacterClass* char_class) {
675 TextElement result = TextElement(CHAR_CLASS);
676 result.data.u_char_class = char_class;
677 return result;
678}
679
680
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000681int TextElement::length() {
682 if (type == ATOM) {
683 return data.u_atom->length();
684 } else {
685 ASSERT(type == CHAR_CLASS);
686 return 1;
687 }
688}
689
690
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000691DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
692 if (table_ == NULL) {
693 table_ = new DispatchTable();
694 DispatchTableConstructor cons(table_, ignore_case);
695 cons.BuildTable(this);
696 }
697 return table_;
698}
699
700
701class RegExpCompiler {
702 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000703 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000704
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000705 int AllocateRegister() {
706 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
707 reg_exp_too_big_ = true;
708 return next_register_;
709 }
710 return next_register_++;
711 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000712
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000713 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
714 RegExpNode* start,
715 int capture_count,
716 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000717
718 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
719
720 static const int kImplementationOffset = 0;
721 static const int kNumberOfRegistersOffset = 0;
722 static const int kCodeOffset = 1;
723
724 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
725 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000726
727 static const int kMaxRecursion = 100;
728 inline int recursion_depth() { return recursion_depth_; }
729 inline void IncrementRecursionDepth() { recursion_depth_++; }
730 inline void DecrementRecursionDepth() { recursion_depth_--; }
731
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000732 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
733
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000734 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000735 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000736
ager@chromium.org32912102009-01-16 10:38:43 +0000737 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000738 private:
739 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000740 int next_register_;
741 List<RegExpNode*>* work_list_;
742 int recursion_depth_;
743 RegExpMacroAssembler* macro_assembler_;
744 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000745 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000746 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000747};
748
749
750class RecursionCheck {
751 public:
752 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
753 compiler->IncrementRecursionDepth();
754 }
755 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
756 private:
757 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000758};
759
760
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000761static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
762 return RegExpEngine::CompilationResult("RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000763}
764
765
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000766// Attempts to compile the regexp using an Irregexp code generator. Returns
767// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000768RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000769 : next_register_(2 * (capture_count + 1)),
770 work_list_(NULL),
771 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000772 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000773 ascii_(ascii),
774 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000775 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000776 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000777}
778
779
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000780RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000781 RegExpMacroAssembler* macro_assembler,
782 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000783 int capture_count,
784 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000785#ifdef DEBUG
786 if (FLAG_trace_regexp_assembler)
787 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
788 else
789#endif
790 macro_assembler_ = macro_assembler;
791 List <RegExpNode*> work_list(0);
792 work_list_ = &work_list;
793 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +0000794 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +0000795 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000796 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000797 macro_assembler_->Bind(&fail);
798 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000799 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000800 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000801 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000802 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
803
ager@chromium.org8bb60582008-12-11 12:02:20 +0000804 Handle<Object> code = macro_assembler_->GetCode(pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000805
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000806 work_list_ = NULL;
807#ifdef DEBUG
808 if (FLAG_trace_regexp_assembler) {
809 delete macro_assembler_;
810 }
811#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000812 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000813}
814
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000815
ager@chromium.org32912102009-01-16 10:38:43 +0000816bool Trace::DeferredAction::Mentions(int that) {
817 if (type() == ActionNode::CLEAR_CAPTURES) {
818 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
819 return range.Contains(that);
820 } else {
821 return reg() == that;
822 }
823}
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000824
ager@chromium.org32912102009-01-16 10:38:43 +0000825
826bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000827 for (DeferredAction* action = actions_;
828 action != NULL;
829 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000830 if (action->Mentions(reg))
831 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000832 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000833 return false;
834}
835
836
ager@chromium.org32912102009-01-16 10:38:43 +0000837bool Trace::GetStoredPosition(int reg, int* cp_offset) {
838 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000839 for (DeferredAction* action = actions_;
840 action != NULL;
841 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000842 if (action->Mentions(reg)) {
843 if (action->type() == ActionNode::STORE_POSITION) {
844 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
845 return true;
846 } else {
847 return false;
848 }
849 }
850 }
851 return false;
852}
853
854
855int Trace::FindAffectedRegisters(OutSet* affected_registers) {
856 int max_register = RegExpCompiler::kNoRegister;
857 for (DeferredAction* action = actions_;
858 action != NULL;
859 action = action->next()) {
860 if (action->type() == ActionNode::CLEAR_CAPTURES) {
861 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
862 for (int i = range.from(); i <= range.to(); i++)
863 affected_registers->Set(i);
864 if (range.to() > max_register) max_register = range.to();
865 } else {
866 affected_registers->Set(action->reg());
867 if (action->reg() > max_register) max_register = action->reg();
868 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000869 }
870 return max_register;
871}
872
873
ager@chromium.org32912102009-01-16 10:38:43 +0000874void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
875 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000876 OutSet& registers_to_pop,
877 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000878 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000879 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
880 else if (registers_to_clear.Get(reg)) {
881 int clear_to = reg;
882 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
883 reg--;
884 }
885 assembler->ClearRegisters(reg, clear_to);
886 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000887 }
888}
889
890
ager@chromium.org32912102009-01-16 10:38:43 +0000891void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
892 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000893 OutSet& affected_registers,
894 OutSet* registers_to_pop,
895 OutSet* registers_to_clear) {
896 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
897 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
898
ager@chromium.org5aa501c2009-06-23 07:57:28 +0000899 // Count pushes performed to force a stack limit check occasionally.
900 int pushes = 0;
901
ager@chromium.org8bb60582008-12-11 12:02:20 +0000902 for (int reg = 0; reg <= max_register; reg++) {
903 if (!affected_registers.Get(reg)) {
904 continue;
905 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000906
907 // The chronologically first deferred action in the trace
908 // is used to infer the action needed to restore a register
909 // to its previous state (or not, if it's safe to ignore it).
910 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
911 DeferredActionUndoType undo_action = IGNORE;
912
ager@chromium.org8bb60582008-12-11 12:02:20 +0000913 int value = 0;
914 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +0000915 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000916 int store_position = -1;
917 // This is a little tricky because we are scanning the actions in reverse
918 // historical order (newest first).
919 for (DeferredAction* action = actions_;
920 action != NULL;
921 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000922 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000923 switch (action->type()) {
924 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +0000925 Trace::DeferredSetRegister* psr =
926 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000927 if (!absolute) {
928 value += psr->value();
929 absolute = true;
930 }
931 // SET_REGISTER is currently only used for newly introduced loop
932 // counters. They can have a significant previous value if they
933 // occour in a loop. TODO(lrn): Propagate this information, so
934 // we can set undo_action to IGNORE if we know there is no value to
935 // restore.
936 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000937 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000938 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000939 break;
940 }
941 case ActionNode::INCREMENT_REGISTER:
942 if (!absolute) {
943 value++;
944 }
945 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000946 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000947 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000948 break;
949 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +0000950 Trace::DeferredCapture* pc =
951 static_cast<Trace::DeferredCapture*>(action);
952 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000953 store_position = pc->cp_offset();
954 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000955
956 // For captures we know that stores and clears alternate.
957 // Other register, are never cleared, and if the occur
958 // inside a loop, they might be assigned more than once.
959 if (reg <= 1) {
960 // Registers zero and one, aka "capture zero", is
961 // always set correctly if we succeed. There is no
962 // need to undo a setting on backtrack, because we
963 // will set it again or fail.
964 undo_action = IGNORE;
965 } else {
966 undo_action = pc->is_capture() ? CLEAR : RESTORE;
967 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000968 ASSERT(!absolute);
969 ASSERT_EQ(value, 0);
970 break;
971 }
ager@chromium.org32912102009-01-16 10:38:43 +0000972 case ActionNode::CLEAR_CAPTURES: {
973 // Since we're scanning in reverse order, if we've already
974 // set the position we have to ignore historically earlier
975 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000976 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +0000977 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000978 }
979 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +0000980 ASSERT(!absolute);
981 ASSERT_EQ(value, 0);
982 break;
983 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000984 default:
985 UNREACHABLE();
986 break;
987 }
988 }
989 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000990 // Prepare for the undo-action (e.g., push if it's going to be popped).
991 if (undo_action == RESTORE) {
992 pushes++;
993 RegExpMacroAssembler::StackCheckFlag stack_check =
994 RegExpMacroAssembler::kNoStackLimitCheck;
995 if (pushes == push_limit) {
996 stack_check = RegExpMacroAssembler::kCheckStackLimit;
997 pushes = 0;
998 }
999
1000 assembler->PushRegister(reg, stack_check);
1001 registers_to_pop->Set(reg);
1002 } else if (undo_action == CLEAR) {
1003 registers_to_clear->Set(reg);
1004 }
1005 // Perform the chronologically last action (or accumulated increment)
1006 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001007 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001008 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001009 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001010 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001011 } else if (absolute) {
1012 assembler->SetRegister(reg, value);
1013 } else if (value != 0) {
1014 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001015 }
1016 }
1017}
1018
1019
ager@chromium.org8bb60582008-12-11 12:02:20 +00001020// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001021// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001022// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001023void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001024 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001025
iposva@chromium.org245aa852009-02-10 00:49:54 +00001026 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001027
1028 if (actions_ == NULL && backtrack() == NULL) {
1029 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001030 // a normal situation. We may also have to forget some information gained
1031 // through a quick check that was already performed.
1032 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001033 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001034 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001035 successor->Emit(compiler, &new_state);
1036 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001037 }
1038
1039 // Generate deferred actions here along with code to undo them again.
1040 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001041
ager@chromium.org381abbb2009-02-25 13:23:22 +00001042 if (backtrack() != NULL) {
1043 // Here we have a concrete backtrack location. These are set up by choice
1044 // nodes and so they indicate that we have a deferred save of the current
1045 // position which we may need to emit here.
1046 assembler->PushCurrentPosition();
1047 }
1048
ager@chromium.org8bb60582008-12-11 12:02:20 +00001049 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001050 OutSet registers_to_pop;
1051 OutSet registers_to_clear;
1052 PerformDeferredActions(assembler,
1053 max_register,
1054 affected_registers,
1055 &registers_to_pop,
1056 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001057 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001058 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001059 }
1060
1061 // Create a new trivial state and generate the node with that.
1062 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001063 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001064 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001065 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001066
1067 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001068 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001069 RestoreAffectedRegisters(assembler,
1070 max_register,
1071 registers_to_pop,
1072 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001073 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001074 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001075 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001076 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001077 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001078 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001079}
1080
1081
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001082void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001083 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001084
1085 // Omit flushing the trace. We discard the entire stack frame anyway.
1086
ager@chromium.org8bb60582008-12-11 12:02:20 +00001087 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001088 // We are completely independent of the trace, since we ignore it,
1089 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001090 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001091 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001092
1093 // Throw away everything on the backtrack stack since the start
1094 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001095 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1096 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001097 if (clear_capture_count_ > 0) {
1098 // Clear any captures that might have been performed during the success
1099 // of the body of the negative look-ahead.
1100 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1101 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1102 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001103 // Now that we have unwound the stack we find at the top of the stack the
1104 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001105 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001106}
1107
1108
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001109void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001110 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001111 trace->Flush(compiler, this);
1112 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001113 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001114 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001115 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001116 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001117 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001118 switch (action_) {
1119 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001120 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001121 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001122 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001123 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001124 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001125 case NEGATIVE_SUBMATCH_SUCCESS:
1126 // This case is handled in a different virtual method.
1127 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001128 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001129 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001130}
1131
1132
1133void GuardedAlternative::AddGuard(Guard* guard) {
1134 if (guards_ == NULL)
1135 guards_ = new ZoneList<Guard*>(1);
1136 guards_->Add(guard);
1137}
1138
1139
ager@chromium.org8bb60582008-12-11 12:02:20 +00001140ActionNode* ActionNode::SetRegister(int reg,
1141 int val,
1142 RegExpNode* on_success) {
1143 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001144 result->data_.u_store_register.reg = reg;
1145 result->data_.u_store_register.value = val;
1146 return result;
1147}
1148
1149
1150ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1151 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1152 result->data_.u_increment_register.reg = reg;
1153 return result;
1154}
1155
1156
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001157ActionNode* ActionNode::StorePosition(int reg,
1158 bool is_capture,
1159 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001160 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1161 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001162 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001163 return result;
1164}
1165
1166
ager@chromium.org32912102009-01-16 10:38:43 +00001167ActionNode* ActionNode::ClearCaptures(Interval range,
1168 RegExpNode* on_success) {
1169 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1170 result->data_.u_clear_captures.range_from = range.from();
1171 result->data_.u_clear_captures.range_to = range.to();
1172 return result;
1173}
1174
1175
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001176ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1177 int position_reg,
1178 RegExpNode* on_success) {
1179 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1180 result->data_.u_submatch.stack_pointer_register = stack_reg;
1181 result->data_.u_submatch.current_position_register = position_reg;
1182 return result;
1183}
1184
1185
ager@chromium.org8bb60582008-12-11 12:02:20 +00001186ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1187 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001188 int clear_register_count,
1189 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001190 RegExpNode* on_success) {
1191 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001192 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001193 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001194 result->data_.u_submatch.clear_register_count = clear_register_count;
1195 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001196 return result;
1197}
1198
1199
ager@chromium.org32912102009-01-16 10:38:43 +00001200ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1201 int repetition_register,
1202 int repetition_limit,
1203 RegExpNode* on_success) {
1204 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1205 result->data_.u_empty_match_check.start_register = start_register;
1206 result->data_.u_empty_match_check.repetition_register = repetition_register;
1207 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1208 return result;
1209}
1210
1211
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001212#define DEFINE_ACCEPT(Type) \
1213 void Type##Node::Accept(NodeVisitor* visitor) { \
1214 visitor->Visit##Type(this); \
1215 }
1216FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1217#undef DEFINE_ACCEPT
1218
1219
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001220void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1221 visitor->VisitLoopChoice(this);
1222}
1223
1224
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001225// -------------------------------------------------------------------
1226// Emit code.
1227
1228
1229void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1230 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001231 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001232 switch (guard->op()) {
1233 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001234 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001235 macro_assembler->IfRegisterGE(guard->reg(),
1236 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001237 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001238 break;
1239 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001240 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001241 macro_assembler->IfRegisterLT(guard->reg(),
1242 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001243 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001244 break;
1245 }
1246}
1247
1248
1249static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1250static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1251
1252
ager@chromium.org381abbb2009-02-25 13:23:22 +00001253// Returns the number of characters in the equivalence class, omitting those
1254// that cannot occur in the source string because it is ASCII.
1255static int GetCaseIndependentLetters(uc16 character,
1256 bool ascii_subject,
1257 unibrow::uchar* letters) {
1258 int length = uncanonicalize.get(character, '\0', letters);
1259 // Unibrow returns 0 or 1 for characters where case independependence is
1260 // trivial.
1261 if (length == 0) {
1262 letters[0] = character;
1263 length = 1;
1264 }
1265 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1266 return length;
1267 }
1268 // The standard requires that non-ASCII characters cannot have ASCII
1269 // character codes in their equivalence class.
1270 return 0;
1271}
1272
1273
1274static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1275 uc16 c,
1276 Label* on_failure,
1277 int cp_offset,
1278 bool check,
1279 bool preloaded) {
1280 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1281 bool bound_checked = false;
1282 if (!preloaded) {
1283 assembler->LoadCurrentCharacter(
1284 cp_offset,
1285 on_failure,
1286 check);
1287 bound_checked = true;
1288 }
1289 assembler->CheckNotCharacter(c, on_failure);
1290 return bound_checked;
1291}
1292
1293
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001294// Only emits non-letters (things that don't have case). Only used for case
1295// independent matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001296static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1297 uc16 c,
1298 Label* on_failure,
1299 int cp_offset,
1300 bool check,
1301 bool preloaded) {
1302 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1303 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001304 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001305 int length = GetCaseIndependentLetters(c, ascii, chars);
1306 if (length < 1) {
1307 // This can't match. Must be an ASCII subject and a non-ASCII character.
1308 // We do not need to do anything since the ASCII pass already handled this.
1309 return false; // Bounds not checked.
1310 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001311 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001312 // We handle the length > 1 case in a later pass.
1313 if (length == 1) {
1314 if (ascii && c > String::kMaxAsciiCharCodeU) {
1315 // Can't match - see above.
1316 return false; // Bounds not checked.
1317 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001318 if (!preloaded) {
1319 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1320 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001321 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001322 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001323 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001324 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001325}
1326
1327
1328static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001329 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001330 uc16 c1,
1331 uc16 c2,
1332 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001333 uc16 char_mask;
1334 if (ascii) {
1335 char_mask = String::kMaxAsciiCharCode;
1336 } else {
1337 char_mask = String::kMaxUC16CharCode;
1338 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001339 uc16 exor = c1 ^ c2;
1340 // Check whether exor has only one bit set.
1341 if (((exor - 1) & exor) == 0) {
1342 // If c1 and c2 differ only by one bit.
1343 // Ecma262UnCanonicalize always gives the highest number last.
1344 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001345 uc16 mask = char_mask ^ exor;
1346 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001347 return true;
1348 }
1349 ASSERT(c2 > c1);
1350 uc16 diff = c2 - c1;
1351 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1352 // If the characters differ by 2^n but don't differ by one bit then
1353 // subtract the difference from the found character, then do the or
1354 // trick. We avoid the theoretical case where negative numbers are
1355 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001356 uc16 mask = char_mask ^ diff;
1357 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1358 diff,
1359 mask,
1360 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001361 return true;
1362 }
1363 return false;
1364}
1365
1366
ager@chromium.org381abbb2009-02-25 13:23:22 +00001367typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1368 uc16 c,
1369 Label* on_failure,
1370 int cp_offset,
1371 bool check,
1372 bool preloaded);
1373
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001374// Only emits letters (things that have case). Only used for case independent
1375// matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001376static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1377 uc16 c,
1378 Label* on_failure,
1379 int cp_offset,
1380 bool check,
1381 bool preloaded) {
1382 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1383 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001384 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001385 int length = GetCaseIndependentLetters(c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001386 if (length <= 1) return false;
1387 // We may not need to check against the end of the input string
1388 // if this character lies before a character that matched.
1389 if (!preloaded) {
1390 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001391 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001392 Label ok;
1393 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1394 switch (length) {
1395 case 2: {
1396 if (ShortCutEmitCharacterPair(macro_assembler,
1397 ascii,
1398 chars[0],
1399 chars[1],
1400 on_failure)) {
1401 } else {
1402 macro_assembler->CheckCharacter(chars[0], &ok);
1403 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1404 macro_assembler->Bind(&ok);
1405 }
1406 break;
1407 }
1408 case 4:
1409 macro_assembler->CheckCharacter(chars[3], &ok);
1410 // Fall through!
1411 case 3:
1412 macro_assembler->CheckCharacter(chars[0], &ok);
1413 macro_assembler->CheckCharacter(chars[1], &ok);
1414 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1415 macro_assembler->Bind(&ok);
1416 break;
1417 default:
1418 UNREACHABLE();
1419 break;
1420 }
1421 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001422}
1423
1424
1425static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1426 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001427 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001428 Label* on_failure,
1429 int cp_offset,
1430 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001431 bool preloaded) {
1432 if (cc->is_standard() &&
1433 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1434 cp_offset,
1435 check_offset,
1436 on_failure)) {
1437 return;
1438 }
1439
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001440 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001441 int max_char;
1442 if (ascii) {
1443 max_char = String::kMaxAsciiCharCode;
1444 } else {
1445 max_char = String::kMaxUC16CharCode;
1446 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001447
1448 Label success;
1449
1450 Label* char_is_in_class =
1451 cc->is_negated() ? on_failure : &success;
1452
1453 int range_count = ranges->length();
1454
ager@chromium.org8bb60582008-12-11 12:02:20 +00001455 int last_valid_range = range_count - 1;
1456 while (last_valid_range >= 0) {
1457 CharacterRange& range = ranges->at(last_valid_range);
1458 if (range.from() <= max_char) {
1459 break;
1460 }
1461 last_valid_range--;
1462 }
1463
1464 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001465 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001466 // TODO(plesner): We can remove this when the node level does our
1467 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001468 macro_assembler->GoTo(on_failure);
1469 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001470 if (check_offset) {
1471 macro_assembler->CheckPosition(cp_offset, on_failure);
1472 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001473 return;
1474 }
1475
ager@chromium.org8bb60582008-12-11 12:02:20 +00001476 if (last_valid_range == 0 &&
1477 !cc->is_negated() &&
1478 ranges->at(0).IsEverything(max_char)) {
1479 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001480 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001481 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001482 }
1483 return;
1484 }
1485
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001486 if (!preloaded) {
1487 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001488 }
1489
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001490 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001491 CharacterRange& range = ranges->at(i);
1492 Label next_range;
1493 uc16 from = range.from();
1494 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001495 if (from > max_char) {
1496 continue;
1497 }
1498 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001499 if (to == from) {
1500 macro_assembler->CheckCharacter(to, char_is_in_class);
1501 } else {
1502 if (from != 0) {
1503 macro_assembler->CheckCharacterLT(from, &next_range);
1504 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001505 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001506 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1507 } else {
1508 macro_assembler->GoTo(char_is_in_class);
1509 }
1510 }
1511 macro_assembler->Bind(&next_range);
1512 }
1513
ager@chromium.org8bb60582008-12-11 12:02:20 +00001514 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001515 uc16 from = range.from();
1516 uc16 to = range.to();
1517
ager@chromium.org8bb60582008-12-11 12:02:20 +00001518 if (to > max_char) to = max_char;
1519 ASSERT(to >= from);
1520
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001521 if (to == from) {
1522 if (cc->is_negated()) {
1523 macro_assembler->CheckCharacter(to, on_failure);
1524 } else {
1525 macro_assembler->CheckNotCharacter(to, on_failure);
1526 }
1527 } else {
1528 if (from != 0) {
1529 if (cc->is_negated()) {
1530 macro_assembler->CheckCharacterLT(from, &success);
1531 } else {
1532 macro_assembler->CheckCharacterLT(from, on_failure);
1533 }
1534 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001535 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001536 if (cc->is_negated()) {
1537 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1538 } else {
1539 macro_assembler->CheckCharacterGT(to, on_failure);
1540 }
1541 } else {
1542 if (cc->is_negated()) {
1543 macro_assembler->GoTo(on_failure);
1544 }
1545 }
1546 }
1547 macro_assembler->Bind(&success);
1548}
1549
1550
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001551RegExpNode::~RegExpNode() {
1552}
1553
1554
ager@chromium.org8bb60582008-12-11 12:02:20 +00001555RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001556 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001557 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001558 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001559 return CONTINUE;
1560 }
1561
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001562 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001563 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001564 if (label_.is_bound()) {
1565 // We are being asked to generate a generic version, but that's already
1566 // been done so just go to it.
1567 macro_assembler->GoTo(&label_);
1568 return DONE;
1569 }
1570 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1571 // To avoid too deep recursion we push the node to the work queue and just
1572 // generate a goto here.
1573 compiler->AddWork(this);
1574 macro_assembler->GoTo(&label_);
1575 return DONE;
1576 }
1577 // Generate generic version of the node and bind the label for later use.
1578 macro_assembler->Bind(&label_);
1579 return CONTINUE;
1580 }
1581
1582 // We are being asked to make a non-generic version. Keep track of how many
1583 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00001584 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001585 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00001586 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00001587 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1588 return CONTINUE;
1589 }
1590
ager@chromium.org32912102009-01-16 10:38:43 +00001591 // If we get here code has been generated for this node too many times or
1592 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00001593 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001594 trace->Flush(compiler, this);
1595 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001596}
1597
1598
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001599int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001600 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1601 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001602 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001603}
1604
1605
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001606int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1607 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1608 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1609}
1610
1611
1612int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1613 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1614 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1615}
1616
1617
1618int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001619 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001620 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001621 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001622 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1623 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001624}
1625
1626
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001627int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1628 int recursion_depth) {
1629 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1630 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1631 // afterwards.
1632 RegExpNode* node = alternatives_->at(1).node();
1633 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1634}
1635
1636
1637void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1638 QuickCheckDetails* details,
1639 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001640 int filled_in,
1641 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001642 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1643 // afterwards.
1644 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00001645 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001646}
1647
1648
1649int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1650 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001651 RegExpNode* ignore_this_node) {
1652 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1653 int min = 100;
1654 int choice_count = alternatives_->length();
1655 for (int i = 0; i < choice_count; i++) {
1656 RegExpNode* node = alternatives_->at(i).node();
1657 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001658 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1659 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001660 if (node_eats_at_least < min) min = node_eats_at_least;
1661 }
1662 return min;
1663}
1664
1665
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001666int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1667 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001668}
1669
1670
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001671int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1672 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001673}
1674
1675
1676// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1677static inline uint32_t SmearBitsRight(uint32_t v) {
1678 v |= v >> 1;
1679 v |= v >> 2;
1680 v |= v >> 4;
1681 v |= v >> 8;
1682 v |= v >> 16;
1683 return v;
1684}
1685
1686
1687bool QuickCheckDetails::Rationalize(bool asc) {
1688 bool found_useful_op = false;
1689 uint32_t char_mask;
1690 if (asc) {
1691 char_mask = String::kMaxAsciiCharCode;
1692 } else {
1693 char_mask = String::kMaxUC16CharCode;
1694 }
1695 mask_ = 0;
1696 value_ = 0;
1697 int char_shift = 0;
1698 for (int i = 0; i < characters_; i++) {
1699 Position* pos = &positions_[i];
1700 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1701 found_useful_op = true;
1702 }
1703 mask_ |= (pos->mask & char_mask) << char_shift;
1704 value_ |= (pos->value & char_mask) << char_shift;
1705 char_shift += asc ? 8 : 16;
1706 }
1707 return found_useful_op;
1708}
1709
1710
1711bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001712 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001713 bool preload_has_checked_bounds,
1714 Label* on_possible_success,
1715 QuickCheckDetails* details,
1716 bool fall_through_on_failure) {
1717 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001718 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1719 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001720 if (!details->Rationalize(compiler->ascii())) return false;
1721 uint32_t mask = details->mask();
1722 uint32_t value = details->value();
1723
1724 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1725
ager@chromium.org32912102009-01-16 10:38:43 +00001726 if (trace->characters_preloaded() != details->characters()) {
1727 assembler->LoadCurrentCharacter(trace->cp_offset(),
1728 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001729 !preload_has_checked_bounds,
1730 details->characters());
1731 }
1732
1733
1734 bool need_mask = true;
1735
1736 if (details->characters() == 1) {
1737 // If number of characters preloaded is 1 then we used a byte or 16 bit
1738 // load so the value is already masked down.
1739 uint32_t char_mask;
1740 if (compiler->ascii()) {
1741 char_mask = String::kMaxAsciiCharCode;
1742 } else {
1743 char_mask = String::kMaxUC16CharCode;
1744 }
1745 if ((mask & char_mask) == char_mask) need_mask = false;
1746 mask &= char_mask;
1747 } else {
1748 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1749 // zero extend.
1750 if (details->characters() == 2 && compiler->ascii()) {
1751 if ((mask & 0xffff) == 0xffff) need_mask = false;
1752 } else {
1753 if (mask == 0xffffffff) need_mask = false;
1754 }
1755 }
1756
1757 if (fall_through_on_failure) {
1758 if (need_mask) {
1759 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1760 } else {
1761 assembler->CheckCharacter(value, on_possible_success);
1762 }
1763 } else {
1764 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00001765 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001766 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00001767 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001768 }
1769 }
1770 return true;
1771}
1772
1773
1774// Here is the meat of GetQuickCheckDetails (see also the comment on the
1775// super-class in the .h file).
1776//
1777// We iterate along the text object, building up for each character a
1778// mask and value that can be used to test for a quick failure to match.
1779// The masks and values for the positions will be combined into a single
1780// machine word for the current character width in order to be used in
1781// generating a quick check.
1782void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1783 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001784 int characters_filled_in,
1785 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001786 ASSERT(characters_filled_in < details->characters());
1787 int characters = details->characters();
1788 int char_mask;
1789 int char_shift;
1790 if (compiler->ascii()) {
1791 char_mask = String::kMaxAsciiCharCode;
1792 char_shift = 8;
1793 } else {
1794 char_mask = String::kMaxUC16CharCode;
1795 char_shift = 16;
1796 }
1797 for (int k = 0; k < elms_->length(); k++) {
1798 TextElement elm = elms_->at(k);
1799 if (elm.type == TextElement::ATOM) {
1800 Vector<const uc16> quarks = elm.data.u_atom->data();
1801 for (int i = 0; i < characters && i < quarks.length(); i++) {
1802 QuickCheckDetails::Position* pos =
1803 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00001804 uc16 c = quarks[i];
1805 if (c > char_mask) {
1806 // If we expect a non-ASCII character from an ASCII string,
1807 // there is no way we can match. Not even case independent
1808 // matching can turn an ASCII character into non-ASCII or
1809 // vice versa.
1810 details->set_cannot_match();
ager@chromium.org381abbb2009-02-25 13:23:22 +00001811 pos->determines_perfectly = false;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001812 return;
1813 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001814 if (compiler->ignore_case()) {
1815 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001816 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1817 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1818 if (length == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001819 // This letter has no case equivalents, so it's nice and simple
1820 // and the mask-compare will determine definitely whether we have
1821 // a match at this character position.
1822 pos->mask = char_mask;
1823 pos->value = c;
1824 pos->determines_perfectly = true;
1825 } else {
1826 uint32_t common_bits = char_mask;
1827 uint32_t bits = chars[0];
1828 for (int j = 1; j < length; j++) {
1829 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1830 common_bits ^= differing_bits;
1831 bits &= common_bits;
1832 }
1833 // If length is 2 and common bits has only one zero in it then
1834 // our mask and compare instruction will determine definitely
1835 // whether we have a match at this character position. Otherwise
1836 // it can only be an approximate check.
1837 uint32_t one_zero = (common_bits | ~char_mask);
1838 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1839 pos->determines_perfectly = true;
1840 }
1841 pos->mask = common_bits;
1842 pos->value = bits;
1843 }
1844 } else {
1845 // Don't ignore case. Nice simple case where the mask-compare will
1846 // determine definitely whether we have a match at this character
1847 // position.
1848 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001849 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001850 pos->determines_perfectly = true;
1851 }
1852 characters_filled_in++;
1853 ASSERT(characters_filled_in <= details->characters());
1854 if (characters_filled_in == details->characters()) {
1855 return;
1856 }
1857 }
1858 } else {
1859 QuickCheckDetails::Position* pos =
1860 details->positions(characters_filled_in);
1861 RegExpCharacterClass* tree = elm.data.u_char_class;
1862 ZoneList<CharacterRange>* ranges = tree->ranges();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001863 if (tree->is_negated()) {
1864 // A quick check uses multi-character mask and compare. There is no
1865 // useful way to incorporate a negative char class into this scheme
1866 // so we just conservatively create a mask and value that will always
1867 // succeed.
1868 pos->mask = 0;
1869 pos->value = 0;
1870 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001871 int first_range = 0;
1872 while (ranges->at(first_range).from() > char_mask) {
1873 first_range++;
1874 if (first_range == ranges->length()) {
1875 details->set_cannot_match();
1876 pos->determines_perfectly = false;
1877 return;
1878 }
1879 }
1880 CharacterRange range = ranges->at(first_range);
1881 uc16 from = range.from();
1882 uc16 to = range.to();
1883 if (to > char_mask) {
1884 to = char_mask;
1885 }
1886 uint32_t differing_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001887 // A mask and compare is only perfect if the differing bits form a
1888 // number like 00011111 with one single block of trailing 1s.
ager@chromium.org5aa501c2009-06-23 07:57:28 +00001889 if ((differing_bits & (differing_bits + 1)) == 0 &&
1890 from + differing_bits == to) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001891 pos->determines_perfectly = true;
1892 }
1893 uint32_t common_bits = ~SmearBitsRight(differing_bits);
ager@chromium.org381abbb2009-02-25 13:23:22 +00001894 uint32_t bits = (from & common_bits);
1895 for (int i = first_range + 1; i < ranges->length(); i++) {
1896 CharacterRange range = ranges->at(i);
1897 uc16 from = range.from();
1898 uc16 to = range.to();
1899 if (from > char_mask) continue;
1900 if (to > char_mask) to = char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001901 // Here we are combining more ranges into the mask and compare
1902 // value. With each new range the mask becomes more sparse and
1903 // so the chances of a false positive rise. A character class
1904 // with multiple ranges is assumed never to be equivalent to a
1905 // mask and compare operation.
1906 pos->determines_perfectly = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001907 uint32_t new_common_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001908 new_common_bits = ~SmearBitsRight(new_common_bits);
1909 common_bits &= new_common_bits;
1910 bits &= new_common_bits;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001911 uint32_t differing_bits = (from & common_bits) ^ bits;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001912 common_bits ^= differing_bits;
1913 bits &= common_bits;
1914 }
1915 pos->mask = common_bits;
1916 pos->value = bits;
1917 }
1918 characters_filled_in++;
1919 ASSERT(characters_filled_in <= details->characters());
1920 if (characters_filled_in == details->characters()) {
1921 return;
1922 }
1923 }
1924 }
1925 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00001926 on_success()-> GetQuickCheckDetails(details,
1927 compiler,
1928 characters_filled_in,
1929 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001930}
1931
1932
1933void QuickCheckDetails::Clear() {
1934 for (int i = 0; i < characters_; i++) {
1935 positions_[i].mask = 0;
1936 positions_[i].value = 0;
1937 positions_[i].determines_perfectly = false;
1938 }
1939 characters_ = 0;
1940}
1941
1942
1943void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001944 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001945 if (by >= characters_) {
1946 Clear();
1947 return;
1948 }
1949 for (int i = 0; i < characters_ - by; i++) {
1950 positions_[i] = positions_[by + i];
1951 }
1952 for (int i = characters_ - by; i < characters_; i++) {
1953 positions_[i].mask = 0;
1954 positions_[i].value = 0;
1955 positions_[i].determines_perfectly = false;
1956 }
1957 characters_ -= by;
1958 // We could change mask_ and value_ here but we would never advance unless
1959 // they had already been used in a check and they won't be used again because
1960 // it would gain us nothing. So there's no point.
1961}
1962
1963
1964void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1965 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00001966 if (other->cannot_match_) {
1967 return;
1968 }
1969 if (cannot_match_) {
1970 *this = *other;
1971 return;
1972 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001973 for (int i = from_index; i < characters_; i++) {
1974 QuickCheckDetails::Position* pos = positions(i);
1975 QuickCheckDetails::Position* other_pos = other->positions(i);
1976 if (pos->mask != other_pos->mask ||
1977 pos->value != other_pos->value ||
1978 !other_pos->determines_perfectly) {
1979 // Our mask-compare operation will be approximate unless we have the
1980 // exact same operation on both sides of the alternation.
1981 pos->determines_perfectly = false;
1982 }
1983 pos->mask &= other_pos->mask;
1984 pos->value &= pos->mask;
1985 other_pos->value &= pos->mask;
1986 uc16 differing_bits = (pos->value ^ other_pos->value);
1987 pos->mask &= ~differing_bits;
1988 pos->value &= pos->mask;
1989 }
1990}
1991
1992
ager@chromium.org32912102009-01-16 10:38:43 +00001993class VisitMarker {
1994 public:
1995 explicit VisitMarker(NodeInfo* info) : info_(info) {
1996 ASSERT(!info->visited);
1997 info->visited = true;
1998 }
1999 ~VisitMarker() {
2000 info_->visited = false;
2001 }
2002 private:
2003 NodeInfo* info_;
2004};
2005
2006
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002007void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2008 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002009 int characters_filled_in,
2010 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002011 if (body_can_be_zero_length_ || info()->visited) return;
2012 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002013 return ChoiceNode::GetQuickCheckDetails(details,
2014 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002015 characters_filled_in,
2016 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002017}
2018
2019
2020void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2021 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002022 int characters_filled_in,
2023 bool not_at_start) {
2024 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002025 int choice_count = alternatives_->length();
2026 ASSERT(choice_count > 0);
2027 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2028 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002029 characters_filled_in,
2030 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002031 for (int i = 1; i < choice_count; i++) {
2032 QuickCheckDetails new_details(details->characters());
2033 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002034 node->GetQuickCheckDetails(&new_details, compiler,
2035 characters_filled_in,
2036 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002037 // Here we merge the quick match details of the two branches.
2038 details->Merge(&new_details, characters_filled_in);
2039 }
2040}
2041
2042
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002043// Check for [0-9A-Z_a-z].
2044static void EmitWordCheck(RegExpMacroAssembler* assembler,
2045 Label* word,
2046 Label* non_word,
2047 bool fall_through_on_word) {
2048 assembler->CheckCharacterGT('z', non_word);
2049 assembler->CheckCharacterLT('0', non_word);
2050 assembler->CheckCharacterGT('a' - 1, word);
2051 assembler->CheckCharacterLT('9' + 1, word);
2052 assembler->CheckCharacterLT('A', non_word);
2053 assembler->CheckCharacterLT('Z' + 1, word);
2054 if (fall_through_on_word) {
2055 assembler->CheckNotCharacter('_', non_word);
2056 } else {
2057 assembler->CheckCharacter('_', word);
2058 }
2059}
2060
2061
2062// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2063// that matches newline or the start of input).
2064static void EmitHat(RegExpCompiler* compiler,
2065 RegExpNode* on_success,
2066 Trace* trace) {
2067 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2068 // We will be loading the previous character into the current character
2069 // register.
2070 Trace new_trace(*trace);
2071 new_trace.InvalidateCurrentCharacter();
2072
2073 Label ok;
2074 if (new_trace.cp_offset() == 0) {
2075 // The start of input counts as a newline in this context, so skip to
2076 // ok if we are at the start.
2077 assembler->CheckAtStart(&ok);
2078 }
2079 // We already checked that we are not at the start of input so it must be
2080 // OK to load the previous character.
2081 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2082 new_trace.backtrack(),
2083 false);
2084 // Newline means \n, \r, 0x2028 or 0x2029.
2085 if (!compiler->ascii()) {
2086 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2087 }
2088 assembler->CheckCharacter('\n', &ok);
2089 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2090 assembler->Bind(&ok);
2091 on_success->Emit(compiler, &new_trace);
2092}
2093
2094
2095// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2096static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2097 RegExpCompiler* compiler,
2098 RegExpNode* on_success,
2099 Trace* trace) {
2100 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2101 Label before_non_word;
2102 Label before_word;
2103 if (trace->characters_preloaded() != 1) {
2104 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2105 }
2106 // Fall through on non-word.
2107 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2108
2109 // We will be loading the previous character into the current character
2110 // register.
2111 Trace new_trace(*trace);
2112 new_trace.InvalidateCurrentCharacter();
2113
2114 Label ok;
2115 Label* boundary;
2116 Label* not_boundary;
2117 if (type == AssertionNode::AT_BOUNDARY) {
2118 boundary = &ok;
2119 not_boundary = new_trace.backtrack();
2120 } else {
2121 not_boundary = &ok;
2122 boundary = new_trace.backtrack();
2123 }
2124
2125 // Next character is not a word character.
2126 assembler->Bind(&before_non_word);
2127 if (new_trace.cp_offset() == 0) {
2128 // The start of input counts as a non-word character, so the question is
2129 // decided if we are at the start.
2130 assembler->CheckAtStart(not_boundary);
2131 }
2132 // We already checked that we are not at the start of input so it must be
2133 // OK to load the previous character.
2134 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2135 &ok, // Unused dummy label in this call.
2136 false);
2137 // Fall through on non-word.
2138 EmitWordCheck(assembler, boundary, not_boundary, false);
2139 assembler->GoTo(not_boundary);
2140
2141 // Next character is a word character.
2142 assembler->Bind(&before_word);
2143 if (new_trace.cp_offset() == 0) {
2144 // The start of input counts as a non-word character, so the question is
2145 // decided if we are at the start.
2146 assembler->CheckAtStart(boundary);
2147 }
2148 // We already checked that we are not at the start of input so it must be
2149 // OK to load the previous character.
2150 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2151 &ok, // Unused dummy label in this call.
2152 false);
2153 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2154 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2155
2156 assembler->Bind(&ok);
2157
2158 on_success->Emit(compiler, &new_trace);
2159}
2160
2161
iposva@chromium.org245aa852009-02-10 00:49:54 +00002162void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2163 RegExpCompiler* compiler,
2164 int filled_in,
2165 bool not_at_start) {
2166 if (type_ == AT_START && not_at_start) {
2167 details->set_cannot_match();
2168 return;
2169 }
2170 return on_success()->GetQuickCheckDetails(details,
2171 compiler,
2172 filled_in,
2173 not_at_start);
2174}
2175
2176
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002177void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2178 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2179 switch (type_) {
2180 case AT_END: {
2181 Label ok;
2182 assembler->CheckPosition(trace->cp_offset(), &ok);
2183 assembler->GoTo(trace->backtrack());
2184 assembler->Bind(&ok);
2185 break;
2186 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002187 case AT_START: {
2188 if (trace->at_start() == Trace::FALSE) {
2189 assembler->GoTo(trace->backtrack());
2190 return;
2191 }
2192 if (trace->at_start() == Trace::UNKNOWN) {
2193 assembler->CheckNotAtStart(trace->backtrack());
2194 Trace at_start_trace = *trace;
2195 at_start_trace.set_at_start(true);
2196 on_success()->Emit(compiler, &at_start_trace);
2197 return;
2198 }
2199 }
2200 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002201 case AFTER_NEWLINE:
2202 EmitHat(compiler, on_success(), trace);
2203 return;
2204 case AT_NON_BOUNDARY:
2205 case AT_BOUNDARY:
2206 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2207 return;
2208 }
2209 on_success()->Emit(compiler, trace);
2210}
2211
2212
ager@chromium.org381abbb2009-02-25 13:23:22 +00002213static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2214 if (quick_check == NULL) return false;
2215 if (offset >= quick_check->characters()) return false;
2216 return quick_check->positions(offset)->determines_perfectly;
2217}
2218
2219
2220static void UpdateBoundsCheck(int index, int* checked_up_to) {
2221 if (index > *checked_up_to) {
2222 *checked_up_to = index;
2223 }
2224}
2225
2226
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002227// We call this repeatedly to generate code for each pass over the text node.
2228// The passes are in increasing order of difficulty because we hope one
2229// of the first passes will fail in which case we are saved the work of the
2230// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2231// we will check the '%' in the first pass, the case independent 'a' in the
2232// second pass and the character class in the last pass.
2233//
2234// The passes are done from right to left, so for example to test for /bar/
2235// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2236// and then a 'b' with offset 0. This means we can avoid the end-of-input
2237// bounds check most of the time. In the example we only need to check for
2238// end-of-input when loading the putative 'r'.
2239//
2240// A slight complication involves the fact that the first character may already
2241// be fetched into a register by the previous node. In this case we want to
2242// do the test for that character first. We do this in separate passes. The
2243// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2244// pass has been performed then subsequent passes will have true in
2245// first_element_checked to indicate that that character does not need to be
2246// checked again.
2247//
ager@chromium.org32912102009-01-16 10:38:43 +00002248// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002249// contain an AlternativeGeneration object. In this AlternativeGeneration
2250// object we can see details of any quick check that was already passed in
2251// order to get to the code we are now generating. The quick check can involve
2252// loading characters, which means we do not need to recheck the bounds
2253// up to the limit the quick check already checked. In addition the quick
2254// check can have involved a mask and compare operation which may simplify
2255// or obviate the need for further checks at some character positions.
2256void TextNode::TextEmitPass(RegExpCompiler* compiler,
2257 TextEmitPassType pass,
2258 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002259 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002260 bool first_element_checked,
2261 int* checked_up_to) {
2262 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2263 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002264 Label* backtrack = trace->backtrack();
2265 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002266 int element_count = elms_->length();
2267 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2268 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002269 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002270 if (elm.type == TextElement::ATOM) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002271 Vector<const uc16> quarks = elm.data.u_atom->data();
2272 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2273 if (first_element_checked && i == 0 && j == 0) continue;
2274 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2275 EmitCharacterFunction* emit_function = NULL;
2276 switch (pass) {
2277 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002278 ASSERT(ascii);
2279 if (quarks[j] > String::kMaxAsciiCharCode) {
2280 assembler->GoTo(backtrack);
2281 return;
2282 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002283 break;
2284 case NON_LETTER_CHARACTER_MATCH:
2285 emit_function = &EmitAtomNonLetter;
2286 break;
2287 case SIMPLE_CHARACTER_MATCH:
2288 emit_function = &EmitSimpleCharacter;
2289 break;
2290 case CASE_CHARACTER_MATCH:
2291 emit_function = &EmitAtomLetter;
2292 break;
2293 default:
2294 break;
2295 }
2296 if (emit_function != NULL) {
2297 bool bound_checked = emit_function(compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00002298 quarks[j],
2299 backtrack,
2300 cp_offset + j,
2301 *checked_up_to < cp_offset + j,
2302 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002303 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002304 }
2305 }
2306 } else {
2307 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002308 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002309 if (first_element_checked && i == 0) continue;
2310 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002311 RegExpCharacterClass* cc = elm.data.u_char_class;
2312 EmitCharClass(assembler,
2313 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002314 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002315 backtrack,
2316 cp_offset,
2317 *checked_up_to < cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002318 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002319 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002320 }
2321 }
2322 }
2323}
2324
2325
2326int TextNode::Length() {
2327 TextElement elm = elms_->last();
2328 ASSERT(elm.cp_offset >= 0);
2329 if (elm.type == TextElement::ATOM) {
2330 return elm.cp_offset + elm.data.u_atom->data().length();
2331 } else {
2332 return elm.cp_offset + 1;
2333 }
2334}
2335
2336
ager@chromium.org381abbb2009-02-25 13:23:22 +00002337bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2338 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2339 if (ignore_case) {
2340 return pass == SIMPLE_CHARACTER_MATCH;
2341 } else {
2342 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2343 }
2344}
2345
2346
ager@chromium.org8bb60582008-12-11 12:02:20 +00002347// This generates the code to match a text node. A text node can contain
2348// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002349// way) and character classes. For efficiency we do not do this in a single
2350// pass from left to right. Instead we pass over the text node several times,
2351// emitting code for some character positions every time. See the comment on
2352// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002353void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002354 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002355 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002356 ASSERT(limit_result == CONTINUE);
2357
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002358 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2359 compiler->SetRegExpTooBig();
2360 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002361 }
2362
2363 if (compiler->ascii()) {
2364 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002365 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002366 }
2367
2368 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002369 int bound_checked_to = trace->cp_offset() - 1;
2370 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002371
2372 // If a character is preloaded into the current character register then
2373 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002374 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002375 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2376 if (!SkipPass(pass, compiler->ignore_case())) {
2377 TextEmitPass(compiler,
2378 static_cast<TextEmitPassType>(pass),
2379 true,
2380 trace,
2381 false,
2382 &bound_checked_to);
2383 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002384 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002385 first_elt_done = true;
2386 }
2387
ager@chromium.org381abbb2009-02-25 13:23:22 +00002388 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2389 if (!SkipPass(pass, compiler->ignore_case())) {
2390 TextEmitPass(compiler,
2391 static_cast<TextEmitPassType>(pass),
2392 false,
2393 trace,
2394 first_elt_done,
2395 &bound_checked_to);
2396 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002397 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002398
ager@chromium.org32912102009-01-16 10:38:43 +00002399 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002400 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002401 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002402 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002403 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002404}
2405
2406
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002407void Trace::InvalidateCurrentCharacter() {
2408 characters_preloaded_ = 0;
2409}
2410
2411
2412void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002413 ASSERT(by > 0);
2414 // We don't have an instruction for shifting the current character register
2415 // down or for using a shifted value for anything so lets just forget that
2416 // we preloaded any characters into it.
2417 characters_preloaded_ = 0;
2418 // Adjust the offsets of the quick check performed information. This
2419 // information is used to find out what we already determined about the
2420 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002421 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002422 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002423 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2424 compiler->SetRegExpTooBig();
2425 cp_offset_ = 0;
2426 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002427 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002428}
2429
2430
2431void TextNode::MakeCaseIndependent() {
2432 int element_count = elms_->length();
2433 for (int i = 0; i < element_count; i++) {
2434 TextElement elm = elms_->at(i);
2435 if (elm.type == TextElement::CHAR_CLASS) {
2436 RegExpCharacterClass* cc = elm.data.u_char_class;
2437 ZoneList<CharacterRange>* ranges = cc->ranges();
2438 int range_count = ranges->length();
2439 for (int i = 0; i < range_count; i++) {
2440 ranges->at(i).AddCaseEquivalents(ranges);
2441 }
2442 }
2443 }
2444}
2445
2446
ager@chromium.org8bb60582008-12-11 12:02:20 +00002447int TextNode::GreedyLoopTextLength() {
2448 TextElement elm = elms_->at(elms_->length() - 1);
2449 if (elm.type == TextElement::CHAR_CLASS) {
2450 return elm.cp_offset + 1;
2451 } else {
2452 return elm.cp_offset + elm.data.u_atom->data().length();
2453 }
2454}
2455
2456
2457// Finds the fixed match length of a sequence of nodes that goes from
2458// this alternative and back to this choice node. If there are variable
2459// length nodes or other complications in the way then return a sentinel
2460// value indicating that a greedy loop cannot be constructed.
2461int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2462 int length = 0;
2463 RegExpNode* node = alternative->node();
2464 // Later we will generate code for all these text nodes using recursion
2465 // so we have to limit the max number.
2466 int recursion_depth = 0;
2467 while (node != this) {
2468 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2469 return kNodeIsTooComplexForGreedyLoops;
2470 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002471 int node_length = node->GreedyLoopTextLength();
2472 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2473 return kNodeIsTooComplexForGreedyLoops;
2474 }
2475 length += node_length;
2476 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2477 node = seq_node->on_success();
2478 }
2479 return length;
2480}
2481
2482
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002483void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2484 ASSERT_EQ(loop_node_, NULL);
2485 AddAlternative(alt);
2486 loop_node_ = alt.node();
2487}
2488
2489
2490void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2491 ASSERT_EQ(continue_node_, NULL);
2492 AddAlternative(alt);
2493 continue_node_ = alt.node();
2494}
2495
2496
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002497void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002498 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002499 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002500 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2501 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2502 // Update the counter-based backtracking info on the stack. This is an
2503 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002504 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002505 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002506 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002507 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002508 }
ager@chromium.org32912102009-01-16 10:38:43 +00002509 ASSERT(trace->stop_node() == NULL);
2510 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002511 trace->Flush(compiler, this);
2512 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002513 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002514 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002515}
2516
2517
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002518int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002519 int preload_characters = EatsAtLeast(4, 0);
ager@chromium.org9085a012009-05-11 19:22:57 +00002520#ifdef V8_HOST_CAN_READ_UNALIGNED
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002521 bool ascii = compiler->ascii();
2522 if (ascii) {
2523 if (preload_characters > 4) preload_characters = 4;
2524 // We can't preload 3 characters because there is no machine instruction
2525 // to do that. We can't just load 4 because we could be reading
2526 // beyond the end of the string, which could cause a memory fault.
2527 if (preload_characters == 3) preload_characters = 2;
2528 } else {
2529 if (preload_characters > 2) preload_characters = 2;
2530 }
2531#else
2532 if (preload_characters > 1) preload_characters = 1;
2533#endif
2534 return preload_characters;
2535}
2536
2537
2538// This class is used when generating the alternatives in a choice node. It
2539// records the way the alternative is being code generated.
2540class AlternativeGeneration: public Malloced {
2541 public:
2542 AlternativeGeneration()
2543 : possible_success(),
2544 expects_preload(false),
2545 after(),
2546 quick_check_details() { }
2547 Label possible_success;
2548 bool expects_preload;
2549 Label after;
2550 QuickCheckDetails quick_check_details;
2551};
2552
2553
2554// Creates a list of AlternativeGenerations. If the list has a reasonable
2555// size then it is on the stack, otherwise the excess is on the heap.
2556class AlternativeGenerationList {
2557 public:
2558 explicit AlternativeGenerationList(int count)
2559 : alt_gens_(count) {
2560 for (int i = 0; i < count && i < kAFew; i++) {
2561 alt_gens_.Add(a_few_alt_gens_ + i);
2562 }
2563 for (int i = kAFew; i < count; i++) {
2564 alt_gens_.Add(new AlternativeGeneration());
2565 }
2566 }
2567 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002568 for (int i = kAFew; i < alt_gens_.length(); i++) {
2569 delete alt_gens_[i];
2570 alt_gens_[i] = NULL;
2571 }
2572 }
2573
2574 AlternativeGeneration* at(int i) {
2575 return alt_gens_[i];
2576 }
2577 private:
2578 static const int kAFew = 10;
2579 ZoneList<AlternativeGeneration*> alt_gens_;
2580 AlternativeGeneration a_few_alt_gens_[kAFew];
2581};
2582
2583
2584/* Code generation for choice nodes.
2585 *
2586 * We generate quick checks that do a mask and compare to eliminate a
2587 * choice. If the quick check succeeds then it jumps to the continuation to
2588 * do slow checks and check subsequent nodes. If it fails (the common case)
2589 * it falls through to the next choice.
2590 *
2591 * Here is the desired flow graph. Nodes directly below each other imply
2592 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2593 * 3 doesn't have a quick check so we have to call the slow check.
2594 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2595 * regexp continuation is generated directly after the Sn node, up to the
2596 * next GoTo if we decide to reuse some already generated code. Some
2597 * nodes expect preload_characters to be preloaded into the current
2598 * character register. R nodes do this preloading. Vertices are marked
2599 * F for failures and S for success (possible success in the case of quick
2600 * nodes). L, V, < and > are used as arrow heads.
2601 *
2602 * ----------> R
2603 * |
2604 * V
2605 * Q1 -----> S1
2606 * | S /
2607 * F| /
2608 * | F/
2609 * | /
2610 * | R
2611 * | /
2612 * V L
2613 * Q2 -----> S2
2614 * | S /
2615 * F| /
2616 * | F/
2617 * | /
2618 * | R
2619 * | /
2620 * V L
2621 * S3
2622 * |
2623 * F|
2624 * |
2625 * R
2626 * |
2627 * backtrack V
2628 * <----------Q4
2629 * \ F |
2630 * \ |S
2631 * \ F V
2632 * \-----S4
2633 *
2634 * For greedy loops we reverse our expectation and expect to match rather
2635 * than fail. Therefore we want the loop code to look like this (U is the
2636 * unwind code that steps back in the greedy loop). The following alternatives
2637 * look the same as above.
2638 * _____
2639 * / \
2640 * V |
2641 * ----------> S1 |
2642 * /| |
2643 * / |S |
2644 * F/ \_____/
2645 * /
2646 * |<-----------
2647 * | \
2648 * V \
2649 * Q2 ---> S2 \
2650 * | S / |
2651 * F| / |
2652 * | F/ |
2653 * | / |
2654 * | R |
2655 * | / |
2656 * F VL |
2657 * <------U |
2658 * back |S |
2659 * \______________/
2660 */
2661
2662
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002663void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002664 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2665 int choice_count = alternatives_->length();
2666#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002667 for (int i = 0; i < choice_count - 1; i++) {
2668 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002669 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002670 int guard_count = (guards == NULL) ? 0 : guards->length();
2671 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002672 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002673 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002674 }
2675#endif
2676
ager@chromium.org32912102009-01-16 10:38:43 +00002677 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002678 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002679 ASSERT(limit_result == CONTINUE);
2680
ager@chromium.org381abbb2009-02-25 13:23:22 +00002681 int new_flush_budget = trace->flush_budget() / choice_count;
2682 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2683 trace->Flush(compiler, this);
2684 return;
2685 }
2686
ager@chromium.org8bb60582008-12-11 12:02:20 +00002687 RecursionCheck rc(compiler);
2688
ager@chromium.org32912102009-01-16 10:38:43 +00002689 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002690
2691 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2692 bool greedy_loop = false;
2693 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00002694 Trace counter_backtrack_trace;
2695 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002696 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2697
ager@chromium.org8bb60582008-12-11 12:02:20 +00002698 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2699 // Here we have special handling for greedy loops containing only text nodes
2700 // and other simple nodes. These are handled by pushing the current
2701 // position on the stack and then incrementing the current position each
2702 // time around the switch. On backtrack we decrement the current position
2703 // and check it against the pushed value. This avoids pushing backtrack
2704 // information for each iteration of the loop, which could take up a lot of
2705 // space.
2706 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00002707 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002708 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00002709 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002710 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00002711 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002712 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00002713 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002714 Label loop_label;
2715 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00002716 greedy_match_trace.set_stop_node(this);
2717 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002718 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002719 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002720 }
2721
2722 Label second_choice; // For use in greedy matches.
2723 macro_assembler->Bind(&second_choice);
2724
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002725 int first_normal_choice = greedy_loop ? 1 : 0;
2726
2727 int preload_characters = CalculatePreloadCharacters(compiler);
2728 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00002729 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002730 bool preload_has_checked_bounds = preload_is_current;
2731
2732 AlternativeGenerationList alt_gens(choice_count);
2733
ager@chromium.org8bb60582008-12-11 12:02:20 +00002734 // For now we just call all choices one after the other. The idea ultimately
2735 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002736 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002737 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002738 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002739 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002740 ZoneList<Guard*>* guards = alternative.guards();
2741 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00002742 Trace new_trace(*current_trace);
2743 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002744 preload_characters :
2745 0);
2746 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00002747 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002748 }
ager@chromium.org32912102009-01-16 10:38:43 +00002749 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002750 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002751 alt_gen->expects_preload = preload_is_current;
2752 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002753 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002754 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002755 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002756 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002757 preload_has_checked_bounds,
2758 &alt_gen->possible_success,
2759 &alt_gen->quick_check_details,
2760 i < choice_count - 1)) {
2761 // Quick check was generated for this choice.
2762 preload_is_current = true;
2763 preload_has_checked_bounds = true;
2764 // On the last choice in the ChoiceNode we generated the quick
2765 // check to fall through on possible success. So now we need to
2766 // generate the full check inline.
2767 if (i == choice_count - 1) {
2768 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002769 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2770 new_trace.set_characters_preloaded(preload_characters);
2771 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002772 generate_full_check_inline = true;
2773 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002774 } else if (alt_gen->quick_check_details.cannot_match()) {
2775 if (i == choice_count - 1 && !greedy_loop) {
2776 macro_assembler->GoTo(trace->backtrack());
2777 }
2778 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002779 } else {
2780 // No quick check was generated. Put the full code here.
2781 // If this is not the first choice then there could be slow checks from
2782 // previous cases that go here when they fail. There's no reason to
2783 // insist that they preload characters since the slow check we are about
2784 // to generate probably can't use it.
2785 if (i != first_normal_choice) {
2786 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002787 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002788 }
2789 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00002790 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002791 }
2792 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002793 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002794 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002795 if (new_trace.actions() != NULL) {
2796 new_trace.set_flush_budget(new_flush_budget);
2797 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002798 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002799 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002800 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002801 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002802 preload_is_current = false;
2803 }
2804 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002805 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002806 if (greedy_loop) {
2807 macro_assembler->Bind(&greedy_loop_label);
2808 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00002809 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002810 // Otherwise try the second priority at an earlier position.
2811 macro_assembler->AdvanceCurrentPosition(-text_length);
2812 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002813 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002814
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002815 // At this point we need to generate slow checks for the alternatives where
2816 // the quick check was inlined. We can recognize these because the associated
2817 // label was bound.
2818 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2819 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002820 Trace new_trace(*current_trace);
2821 // If there are actions to be flushed we have to limit how many times
2822 // they are flushed. Take the budget of the parent trace and distribute
2823 // it fairly amongst the children.
2824 if (new_trace.actions() != NULL) {
2825 new_trace.set_flush_budget(new_flush_budget);
2826 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002827 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002828 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002829 alternatives_->at(i),
2830 alt_gen,
2831 preload_characters,
2832 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002833 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002834}
2835
2836
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002837void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002838 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002839 GuardedAlternative alternative,
2840 AlternativeGeneration* alt_gen,
2841 int preload_characters,
2842 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002843 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002844
2845 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2846 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002847 Trace out_of_line_trace(*trace);
2848 out_of_line_trace.set_characters_preloaded(preload_characters);
2849 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002850 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002851 ZoneList<Guard*>* guards = alternative.guards();
2852 int guard_count = (guards == NULL) ? 0 : guards->length();
2853 if (next_expects_preload) {
2854 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00002855 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002856 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002857 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002858 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002859 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002860 macro_assembler->Bind(&reload_current_char);
2861 // Reload the current character, since the next quick check expects that.
2862 // We don't need to check bounds here because we only get into this
2863 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00002864 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002865 NULL,
2866 false,
2867 preload_characters);
2868 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002869 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002870 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002871 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002872 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002873 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002874 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002875 }
2876}
2877
2878
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002879void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002880 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002881 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002882 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002883 ASSERT(limit_result == CONTINUE);
2884
2885 RecursionCheck rc(compiler);
2886
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002887 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002888 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00002889 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002890 new_capture(data_.u_position_register.reg,
2891 data_.u_position_register.is_capture,
2892 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002893 Trace new_trace = *trace;
2894 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002895 on_success()->Emit(compiler, &new_trace);
2896 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002897 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002898 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002899 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002900 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00002901 Trace new_trace = *trace;
2902 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002903 on_success()->Emit(compiler, &new_trace);
2904 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002905 }
2906 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002907 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002908 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00002909 Trace new_trace = *trace;
2910 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002911 on_success()->Emit(compiler, &new_trace);
2912 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002913 }
2914 case CLEAR_CAPTURES: {
2915 Trace::DeferredClearCaptures
2916 new_capture(Interval(data_.u_clear_captures.range_from,
2917 data_.u_clear_captures.range_to));
2918 Trace new_trace = *trace;
2919 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002920 on_success()->Emit(compiler, &new_trace);
2921 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002922 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002923 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002924 if (!trace->is_trivial()) {
2925 trace->Flush(compiler, this);
2926 } else {
2927 assembler->WriteCurrentPositionToRegister(
2928 data_.u_submatch.current_position_register, 0);
2929 assembler->WriteStackPointerToRegister(
2930 data_.u_submatch.stack_pointer_register);
2931 on_success()->Emit(compiler, trace);
2932 }
2933 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002934 case EMPTY_MATCH_CHECK: {
2935 int start_pos_reg = data_.u_empty_match_check.start_register;
2936 int stored_pos = 0;
2937 int rep_reg = data_.u_empty_match_check.repetition_register;
2938 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2939 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2940 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2941 // If we know we haven't advanced and there is no minimum we
2942 // can just backtrack immediately.
2943 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00002944 } else if (know_dist && stored_pos < trace->cp_offset()) {
2945 // If we know we've advanced we can generate the continuation
2946 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002947 on_success()->Emit(compiler, trace);
2948 } else if (!trace->is_trivial()) {
2949 trace->Flush(compiler, this);
2950 } else {
2951 Label skip_empty_check;
2952 // If we have a minimum number of repetitions we check the current
2953 // number first and skip the empty check if it's not enough.
2954 if (has_minimum) {
2955 int limit = data_.u_empty_match_check.repetition_limit;
2956 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2957 }
2958 // If the match is empty we bail out, otherwise we fall through
2959 // to the on-success continuation.
2960 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2961 trace->backtrack());
2962 assembler->Bind(&skip_empty_check);
2963 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002964 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002965 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002966 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002967 case POSITIVE_SUBMATCH_SUCCESS: {
2968 if (!trace->is_trivial()) {
2969 trace->Flush(compiler, this);
2970 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002971 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002972 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002973 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002974 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002975 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002976 int clear_register_count = data_.u_submatch.clear_register_count;
2977 if (clear_register_count == 0) {
2978 on_success()->Emit(compiler, trace);
2979 return;
2980 }
2981 int clear_registers_from = data_.u_submatch.clear_register_from;
2982 Label clear_registers_backtrack;
2983 Trace new_trace = *trace;
2984 new_trace.set_backtrack(&clear_registers_backtrack);
2985 on_success()->Emit(compiler, &new_trace);
2986
2987 assembler->Bind(&clear_registers_backtrack);
2988 int clear_registers_to = clear_registers_from + clear_register_count - 1;
2989 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
2990
2991 ASSERT(trace->backtrack() == NULL);
2992 assembler->Backtrack();
2993 return;
2994 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002995 default:
2996 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002997 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002998}
2999
3000
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003001void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003002 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003003 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003004 trace->Flush(compiler, this);
3005 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003006 }
3007
ager@chromium.org32912102009-01-16 10:38:43 +00003008 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003009 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003010 ASSERT(limit_result == CONTINUE);
3011
3012 RecursionCheck rc(compiler);
3013
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003014 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003015 if (compiler->ignore_case()) {
3016 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3017 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003018 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003019 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003020 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003021 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003022}
3023
3024
3025// -------------------------------------------------------------------
3026// Dot/dotty output
3027
3028
3029#ifdef DEBUG
3030
3031
3032class DotPrinter: public NodeVisitor {
3033 public:
3034 explicit DotPrinter(bool ignore_case)
3035 : ignore_case_(ignore_case),
3036 stream_(&alloc_) { }
3037 void PrintNode(const char* label, RegExpNode* node);
3038 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003039 void PrintAttributes(RegExpNode* from);
3040 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003041 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003042#define DECLARE_VISIT(Type) \
3043 virtual void Visit##Type(Type##Node* that);
3044FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3045#undef DECLARE_VISIT
3046 private:
3047 bool ignore_case_;
3048 HeapStringAllocator alloc_;
3049 StringStream stream_;
3050};
3051
3052
3053void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3054 stream()->Add("digraph G {\n graph [label=\"");
3055 for (int i = 0; label[i]; i++) {
3056 switch (label[i]) {
3057 case '\\':
3058 stream()->Add("\\\\");
3059 break;
3060 case '"':
3061 stream()->Add("\"");
3062 break;
3063 default:
3064 stream()->Put(label[i]);
3065 break;
3066 }
3067 }
3068 stream()->Add("\"];\n");
3069 Visit(node);
3070 stream()->Add("}\n");
3071 printf("%s", *(stream()->ToCString()));
3072}
3073
3074
3075void DotPrinter::Visit(RegExpNode* node) {
3076 if (node->info()->visited) return;
3077 node->info()->visited = true;
3078 node->Accept(this);
3079}
3080
3081
3082void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003083 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3084 Visit(on_failure);
3085}
3086
3087
3088class TableEntryBodyPrinter {
3089 public:
3090 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3091 : stream_(stream), choice_(choice) { }
3092 void Call(uc16 from, DispatchTable::Entry entry) {
3093 OutSet* out_set = entry.out_set();
3094 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3095 if (out_set->Get(i)) {
3096 stream()->Add(" n%p:s%io%i -> n%p;\n",
3097 choice(),
3098 from,
3099 i,
3100 choice()->alternatives()->at(i).node());
3101 }
3102 }
3103 }
3104 private:
3105 StringStream* stream() { return stream_; }
3106 ChoiceNode* choice() { return choice_; }
3107 StringStream* stream_;
3108 ChoiceNode* choice_;
3109};
3110
3111
3112class TableEntryHeaderPrinter {
3113 public:
3114 explicit TableEntryHeaderPrinter(StringStream* stream)
3115 : first_(true), stream_(stream) { }
3116 void Call(uc16 from, DispatchTable::Entry entry) {
3117 if (first_) {
3118 first_ = false;
3119 } else {
3120 stream()->Add("|");
3121 }
3122 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3123 OutSet* out_set = entry.out_set();
3124 int priority = 0;
3125 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3126 if (out_set->Get(i)) {
3127 if (priority > 0) stream()->Add("|");
3128 stream()->Add("<s%io%i> %i", from, i, priority);
3129 priority++;
3130 }
3131 }
3132 stream()->Add("}}");
3133 }
3134 private:
3135 bool first_;
3136 StringStream* stream() { return stream_; }
3137 StringStream* stream_;
3138};
3139
3140
3141class AttributePrinter {
3142 public:
3143 explicit AttributePrinter(DotPrinter* out)
3144 : out_(out), first_(true) { }
3145 void PrintSeparator() {
3146 if (first_) {
3147 first_ = false;
3148 } else {
3149 out_->stream()->Add("|");
3150 }
3151 }
3152 void PrintBit(const char* name, bool value) {
3153 if (!value) return;
3154 PrintSeparator();
3155 out_->stream()->Add("{%s}", name);
3156 }
3157 void PrintPositive(const char* name, int value) {
3158 if (value < 0) return;
3159 PrintSeparator();
3160 out_->stream()->Add("{%s|%x}", name, value);
3161 }
3162 private:
3163 DotPrinter* out_;
3164 bool first_;
3165};
3166
3167
3168void DotPrinter::PrintAttributes(RegExpNode* that) {
3169 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3170 "margin=0.1, fontsize=10, label=\"{",
3171 that);
3172 AttributePrinter printer(this);
3173 NodeInfo* info = that->info();
3174 printer.PrintBit("NI", info->follows_newline_interest);
3175 printer.PrintBit("WI", info->follows_word_interest);
3176 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003177 Label* label = that->label();
3178 if (label->is_bound())
3179 printer.PrintPositive("@", label->pos());
3180 stream()->Add("}\"];\n");
3181 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3182 "arrowhead=none];\n", that, that);
3183}
3184
3185
3186static const bool kPrintDispatchTable = false;
3187void DotPrinter::VisitChoice(ChoiceNode* that) {
3188 if (kPrintDispatchTable) {
3189 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3190 TableEntryHeaderPrinter header_printer(stream());
3191 that->GetTable(ignore_case_)->ForEach(&header_printer);
3192 stream()->Add("\"]\n", that);
3193 PrintAttributes(that);
3194 TableEntryBodyPrinter body_printer(stream(), that);
3195 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003196 } else {
3197 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3198 for (int i = 0; i < that->alternatives()->length(); i++) {
3199 GuardedAlternative alt = that->alternatives()->at(i);
3200 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3201 }
3202 }
3203 for (int i = 0; i < that->alternatives()->length(); i++) {
3204 GuardedAlternative alt = that->alternatives()->at(i);
3205 alt.node()->Accept(this);
3206 }
3207}
3208
3209
3210void DotPrinter::VisitText(TextNode* that) {
3211 stream()->Add(" n%p [label=\"", that);
3212 for (int i = 0; i < that->elements()->length(); i++) {
3213 if (i > 0) stream()->Add(" ");
3214 TextElement elm = that->elements()->at(i);
3215 switch (elm.type) {
3216 case TextElement::ATOM: {
3217 stream()->Add("'%w'", elm.data.u_atom->data());
3218 break;
3219 }
3220 case TextElement::CHAR_CLASS: {
3221 RegExpCharacterClass* node = elm.data.u_char_class;
3222 stream()->Add("[");
3223 if (node->is_negated())
3224 stream()->Add("^");
3225 for (int j = 0; j < node->ranges()->length(); j++) {
3226 CharacterRange range = node->ranges()->at(j);
3227 stream()->Add("%k-%k", range.from(), range.to());
3228 }
3229 stream()->Add("]");
3230 break;
3231 }
3232 default:
3233 UNREACHABLE();
3234 }
3235 }
3236 stream()->Add("\", shape=box, peripheries=2];\n");
3237 PrintAttributes(that);
3238 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3239 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003240}
3241
3242
3243void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3244 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3245 that,
3246 that->start_register(),
3247 that->end_register());
3248 PrintAttributes(that);
3249 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3250 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003251}
3252
3253
3254void DotPrinter::VisitEnd(EndNode* that) {
3255 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3256 PrintAttributes(that);
3257}
3258
3259
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003260void DotPrinter::VisitAssertion(AssertionNode* that) {
3261 stream()->Add(" n%p [", that);
3262 switch (that->type()) {
3263 case AssertionNode::AT_END:
3264 stream()->Add("label=\"$\", shape=septagon");
3265 break;
3266 case AssertionNode::AT_START:
3267 stream()->Add("label=\"^\", shape=septagon");
3268 break;
3269 case AssertionNode::AT_BOUNDARY:
3270 stream()->Add("label=\"\\b\", shape=septagon");
3271 break;
3272 case AssertionNode::AT_NON_BOUNDARY:
3273 stream()->Add("label=\"\\B\", shape=septagon");
3274 break;
3275 case AssertionNode::AFTER_NEWLINE:
3276 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3277 break;
3278 }
3279 stream()->Add("];\n");
3280 PrintAttributes(that);
3281 RegExpNode* successor = that->on_success();
3282 stream()->Add(" n%p -> n%p;\n", that, successor);
3283 Visit(successor);
3284}
3285
3286
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003287void DotPrinter::VisitAction(ActionNode* that) {
3288 stream()->Add(" n%p [", that);
3289 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003290 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003291 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3292 that->data_.u_store_register.reg,
3293 that->data_.u_store_register.value);
3294 break;
3295 case ActionNode::INCREMENT_REGISTER:
3296 stream()->Add("label=\"$%i++\", shape=octagon",
3297 that->data_.u_increment_register.reg);
3298 break;
3299 case ActionNode::STORE_POSITION:
3300 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3301 that->data_.u_position_register.reg);
3302 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003303 case ActionNode::BEGIN_SUBMATCH:
3304 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3305 that->data_.u_submatch.current_position_register);
3306 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003307 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003308 stream()->Add("label=\"escape\", shape=septagon");
3309 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003310 case ActionNode::EMPTY_MATCH_CHECK:
3311 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3312 that->data_.u_empty_match_check.start_register,
3313 that->data_.u_empty_match_check.repetition_register,
3314 that->data_.u_empty_match_check.repetition_limit);
3315 break;
3316 case ActionNode::CLEAR_CAPTURES: {
3317 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3318 that->data_.u_clear_captures.range_from,
3319 that->data_.u_clear_captures.range_to);
3320 break;
3321 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003322 }
3323 stream()->Add("];\n");
3324 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003325 RegExpNode* successor = that->on_success();
3326 stream()->Add(" n%p -> n%p;\n", that, successor);
3327 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003328}
3329
3330
3331class DispatchTableDumper {
3332 public:
3333 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3334 void Call(uc16 key, DispatchTable::Entry entry);
3335 StringStream* stream() { return stream_; }
3336 private:
3337 StringStream* stream_;
3338};
3339
3340
3341void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3342 stream()->Add("[%k-%k]: {", key, entry.to());
3343 OutSet* set = entry.out_set();
3344 bool first = true;
3345 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3346 if (set->Get(i)) {
3347 if (first) {
3348 first = false;
3349 } else {
3350 stream()->Add(", ");
3351 }
3352 stream()->Add("%i", i);
3353 }
3354 }
3355 stream()->Add("}\n");
3356}
3357
3358
3359void DispatchTable::Dump() {
3360 HeapStringAllocator alloc;
3361 StringStream stream(&alloc);
3362 DispatchTableDumper dumper(&stream);
3363 tree()->ForEach(&dumper);
3364 OS::PrintError("%s", *stream.ToCString());
3365}
3366
3367
3368void RegExpEngine::DotPrint(const char* label,
3369 RegExpNode* node,
3370 bool ignore_case) {
3371 DotPrinter printer(ignore_case);
3372 printer.PrintNode(label, node);
3373}
3374
3375
3376#endif // DEBUG
3377
3378
3379// -------------------------------------------------------------------
3380// Tree to graph conversion
3381
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003382static const int kSpaceRangeCount = 20;
3383static const int kSpaceRangeAsciiCount = 4;
3384static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3385 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3386 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3387
3388static const int kWordRangeCount = 8;
3389static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3390 '_', 'a', 'z' };
3391
3392static const int kDigitRangeCount = 2;
3393static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3394
3395static const int kLineTerminatorRangeCount = 6;
3396static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3397 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003398
3399RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003400 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003401 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3402 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003403 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003404}
3405
3406
3407RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003408 RegExpNode* on_success) {
3409 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003410}
3411
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003412static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3413 const uc16* special_class,
3414 int length) {
3415 ASSERT(ranges->length() != 0);
3416 ASSERT(length != 0);
3417 ASSERT(special_class[0] != 0);
3418 if (ranges->length() != (length >> 1) + 1) {
3419 return false;
3420 }
3421 CharacterRange range = ranges->at(0);
3422 if (range.from() != 0) {
3423 return false;
3424 }
3425 for (int i = 0; i < length; i += 2) {
3426 if (special_class[i] != (range.to() + 1)) {
3427 return false;
3428 }
3429 range = ranges->at((i >> 1) + 1);
3430 if (special_class[i+1] != range.from() - 1) {
3431 return false;
3432 }
3433 }
3434 if (range.to() != 0xffff) {
3435 return false;
3436 }
3437 return true;
3438}
3439
3440
3441static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3442 const uc16* special_class,
3443 int length) {
3444 if (ranges->length() * 2 != length) {
3445 return false;
3446 }
3447 for (int i = 0; i < length; i += 2) {
3448 CharacterRange range = ranges->at(i >> 1);
3449 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3450 return false;
3451 }
3452 }
3453 return true;
3454}
3455
3456
3457bool RegExpCharacterClass::is_standard() {
3458 // TODO(lrn): Remove need for this function, by not throwing away information
3459 // along the way.
3460 if (is_negated_) {
3461 return false;
3462 }
3463 if (set_.is_standard()) {
3464 return true;
3465 }
3466 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3467 set_.set_standard_set_type('s');
3468 return true;
3469 }
3470 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3471 set_.set_standard_set_type('S');
3472 return true;
3473 }
3474 if (CompareInverseRanges(set_.ranges(),
3475 kLineTerminatorRanges,
3476 kLineTerminatorRangeCount)) {
3477 set_.set_standard_set_type('.');
3478 return true;
3479 }
3480 return false;
3481}
3482
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003483
3484RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003485 RegExpNode* on_success) {
3486 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003487}
3488
3489
3490RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003491 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003492 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3493 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003494 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003495 for (int i = 0; i < length; i++) {
3496 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003497 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003498 result->AddAlternative(alternative);
3499 }
3500 return result;
3501}
3502
3503
3504RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003505 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003506 return ToNode(min(),
3507 max(),
3508 is_greedy(),
3509 body(),
3510 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003511 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003512}
3513
3514
3515RegExpNode* RegExpQuantifier::ToNode(int min,
3516 int max,
3517 bool is_greedy,
3518 RegExpTree* body,
3519 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003520 RegExpNode* on_success,
3521 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003522 // x{f, t} becomes this:
3523 //
3524 // (r++)<-.
3525 // | `
3526 // | (x)
3527 // v ^
3528 // (r=0)-->(?)---/ [if r < t]
3529 // |
3530 // [if r >= f] \----> ...
3531 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003532
3533 // 15.10.2.5 RepeatMatcher algorithm.
3534 // The parser has already eliminated the case where max is 0. In the case
3535 // where max_match is zero the parser has removed the quantifier if min was
3536 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3537
3538 // If we know that we cannot match zero length then things are a little
3539 // simpler since we don't need to make the special zero length match check
3540 // from step 2.1. If the min and max are small we can unroll a little in
3541 // this case.
3542 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3543 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3544 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003545 bool body_can_be_empty = (body->min_match() == 0);
3546 int body_start_reg = RegExpCompiler::kNoRegister;
3547 Interval capture_registers = body->CaptureRegisters();
3548 bool needs_capture_clearing = !capture_registers.is_empty();
3549 if (body_can_be_empty) {
3550 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00003551 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003552 // Only unroll if there are no captures and the body can't be
3553 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003554 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3555 int new_max = (max == kInfinity) ? max : max - min;
3556 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003557 RegExpNode* answer = ToNode(
3558 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003559 // Unroll the forced matches from 0 to min. This can cause chains of
3560 // TextNodes (which the parser does not generate). These should be
3561 // combined if it turns out they hinder good code generation.
3562 for (int i = 0; i < min; i++) {
3563 answer = body->ToNode(compiler, answer);
3564 }
3565 return answer;
3566 }
3567 if (max <= kMaxUnrolledMaxMatches) {
3568 ASSERT(min == 0);
3569 // Unroll the optional matches up to max.
3570 RegExpNode* answer = on_success;
3571 for (int i = 0; i < max; i++) {
3572 ChoiceNode* alternation = new ChoiceNode(2);
3573 if (is_greedy) {
3574 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3575 answer)));
3576 alternation->AddAlternative(GuardedAlternative(on_success));
3577 } else {
3578 alternation->AddAlternative(GuardedAlternative(on_success));
3579 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3580 answer)));
3581 }
3582 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003583 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003584 }
3585 return answer;
3586 }
3587 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003588 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003589 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003590 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003591 int reg_ctr = needs_counter
3592 ? compiler->AllocateRegister()
3593 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003594 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003595 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003596 RegExpNode* loop_return = needs_counter
3597 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3598 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003599 if (body_can_be_empty) {
3600 // If the body can be empty we need to check if it was and then
3601 // backtrack.
3602 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3603 reg_ctr,
3604 min,
3605 loop_return);
3606 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003607 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003608 if (body_can_be_empty) {
3609 // If the body can be empty we need to store the start position
3610 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003611 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00003612 }
3613 if (needs_capture_clearing) {
3614 // Before entering the body of this loop we need to clear captures.
3615 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3616 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003617 GuardedAlternative body_alt(body_node);
3618 if (has_max) {
3619 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3620 body_alt.AddGuard(body_guard);
3621 }
3622 GuardedAlternative rest_alt(on_success);
3623 if (has_min) {
3624 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3625 rest_alt.AddGuard(rest_guard);
3626 }
3627 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003628 center->AddLoopAlternative(body_alt);
3629 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003630 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003631 center->AddContinueAlternative(rest_alt);
3632 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003633 }
3634 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003635 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003636 } else {
3637 return center;
3638 }
3639}
3640
3641
3642RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003643 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003644 NodeInfo info;
3645 switch (type()) {
3646 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003647 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003648 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003649 return AssertionNode::AtStart(on_success);
3650 case BOUNDARY:
3651 return AssertionNode::AtBoundary(on_success);
3652 case NON_BOUNDARY:
3653 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003654 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003655 return AssertionNode::AtEnd(on_success);
3656 case END_OF_LINE: {
3657 // Compile $ in multiline regexps as an alternation with a positive
3658 // lookahead in one side and an end-of-input on the other side.
3659 // We need two registers for the lookahead.
3660 int stack_pointer_register = compiler->AllocateRegister();
3661 int position_register = compiler->AllocateRegister();
3662 // The ChoiceNode to distinguish between a newline and end-of-input.
3663 ChoiceNode* result = new ChoiceNode(2);
3664 // Create a newline atom.
3665 ZoneList<CharacterRange>* newline_ranges =
3666 new ZoneList<CharacterRange>(3);
3667 CharacterRange::AddClassEscape('n', newline_ranges);
3668 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3669 TextNode* newline_matcher = new TextNode(
3670 newline_atom,
3671 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3672 position_register,
3673 0, // No captures inside.
3674 -1, // Ignored if no captures.
3675 on_success));
3676 // Create an end-of-input matcher.
3677 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3678 stack_pointer_register,
3679 position_register,
3680 newline_matcher);
3681 // Add the two alternatives to the ChoiceNode.
3682 GuardedAlternative eol_alternative(end_of_line);
3683 result->AddAlternative(eol_alternative);
3684 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3685 result->AddAlternative(end_alternative);
3686 return result;
3687 }
3688 default:
3689 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003690 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003691 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003692}
3693
3694
3695RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003696 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003697 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3698 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003699 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003700}
3701
3702
3703RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003704 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003705 return on_success;
3706}
3707
3708
3709RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003710 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003711 int stack_pointer_register = compiler->AllocateRegister();
3712 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003713
3714 const int registers_per_capture = 2;
3715 const int register_of_first_capture = 2;
3716 int register_count = capture_count_ * registers_per_capture;
3717 int register_start =
3718 register_of_first_capture + capture_from_ * registers_per_capture;
3719
ager@chromium.org8bb60582008-12-11 12:02:20 +00003720 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003721 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003722 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003723 stack_pointer_register,
3724 position_register,
3725 body()->ToNode(
3726 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003727 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3728 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003729 register_count,
3730 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003731 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003732 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003733 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003734 // We use a ChoiceNode for a negative lookahead because it has most of
3735 // the characteristics we need. It has the body of the lookahead as its
3736 // first alternative and the expression after the lookahead of the second
3737 // alternative. If the first alternative succeeds then the
3738 // NegativeSubmatchSuccess will unwind the stack including everything the
3739 // choice node set up and backtrack. If the first alternative fails then
3740 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003741 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3742 // ChoiceNode that knows to ignore the first exit when calculating quick
3743 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003744 GuardedAlternative body_alt(
3745 body()->ToNode(
3746 compiler,
3747 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003748 position_register,
3749 register_count,
3750 register_start)));
3751 ChoiceNode* choice_node =
3752 new NegativeLookaheadChoiceNode(body_alt,
3753 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003754 return ActionNode::BeginSubmatch(stack_pointer_register,
3755 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003756 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003757 }
3758}
3759
3760
3761RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003762 RegExpNode* on_success) {
3763 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003764}
3765
3766
3767RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3768 int index,
3769 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003770 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003771 int start_reg = RegExpCapture::StartRegister(index);
3772 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003773 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003774 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003775 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003776}
3777
3778
3779RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003780 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003781 ZoneList<RegExpTree*>* children = nodes();
3782 RegExpNode* current = on_success;
3783 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003784 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003785 }
3786 return current;
3787}
3788
3789
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003790static void AddClass(const uc16* elmv,
3791 int elmc,
3792 ZoneList<CharacterRange>* ranges) {
3793 for (int i = 0; i < elmc; i += 2) {
3794 ASSERT(elmv[i] <= elmv[i + 1]);
3795 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3796 }
3797}
3798
3799
3800static void AddClassNegated(const uc16 *elmv,
3801 int elmc,
3802 ZoneList<CharacterRange>* ranges) {
3803 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003804 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003805 uc16 last = 0x0000;
3806 for (int i = 0; i < elmc; i += 2) {
3807 ASSERT(last <= elmv[i] - 1);
3808 ASSERT(elmv[i] <= elmv[i + 1]);
3809 ranges->Add(CharacterRange(last, elmv[i] - 1));
3810 last = elmv[i + 1] + 1;
3811 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003812 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003813}
3814
3815
3816void CharacterRange::AddClassEscape(uc16 type,
3817 ZoneList<CharacterRange>* ranges) {
3818 switch (type) {
3819 case 's':
3820 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3821 break;
3822 case 'S':
3823 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3824 break;
3825 case 'w':
3826 AddClass(kWordRanges, kWordRangeCount, ranges);
3827 break;
3828 case 'W':
3829 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3830 break;
3831 case 'd':
3832 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3833 break;
3834 case 'D':
3835 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3836 break;
3837 case '.':
3838 AddClassNegated(kLineTerminatorRanges,
3839 kLineTerminatorRangeCount,
3840 ranges);
3841 break;
3842 // This is not a character range as defined by the spec but a
3843 // convenient shorthand for a character class that matches any
3844 // character.
3845 case '*':
3846 ranges->Add(CharacterRange::Everything());
3847 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003848 // This is the set of characters matched by the $ and ^ symbols
3849 // in multiline mode.
3850 case 'n':
3851 AddClass(kLineTerminatorRanges,
3852 kLineTerminatorRangeCount,
3853 ranges);
3854 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003855 default:
3856 UNREACHABLE();
3857 }
3858}
3859
3860
3861Vector<const uc16> CharacterRange::GetWordBounds() {
3862 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3863}
3864
3865
3866class CharacterRangeSplitter {
3867 public:
3868 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3869 ZoneList<CharacterRange>** excluded)
3870 : included_(included),
3871 excluded_(excluded) { }
3872 void Call(uc16 from, DispatchTable::Entry entry);
3873
3874 static const int kInBase = 0;
3875 static const int kInOverlay = 1;
3876
3877 private:
3878 ZoneList<CharacterRange>** included_;
3879 ZoneList<CharacterRange>** excluded_;
3880};
3881
3882
3883void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3884 if (!entry.out_set()->Get(kInBase)) return;
3885 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3886 ? included_
3887 : excluded_;
3888 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3889 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3890}
3891
3892
3893void CharacterRange::Split(ZoneList<CharacterRange>* base,
3894 Vector<const uc16> overlay,
3895 ZoneList<CharacterRange>** included,
3896 ZoneList<CharacterRange>** excluded) {
3897 ASSERT_EQ(NULL, *included);
3898 ASSERT_EQ(NULL, *excluded);
3899 DispatchTable table;
3900 for (int i = 0; i < base->length(); i++)
3901 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3902 for (int i = 0; i < overlay.length(); i += 2) {
3903 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3904 CharacterRangeSplitter::kInOverlay);
3905 }
3906 CharacterRangeSplitter callback(included, excluded);
3907 table.ForEach(&callback);
3908}
3909
3910
3911void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3912 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3913 if (IsSingleton()) {
3914 // If this is a singleton we just expand the one character.
3915 int length = uncanonicalize.get(from(), '\0', chars);
3916 for (int i = 0; i < length; i++) {
3917 uc32 chr = chars[i];
3918 if (chr != from()) {
3919 ranges->Add(CharacterRange::Singleton(chars[i]));
3920 }
3921 }
3922 } else if (from() <= kRangeCanonicalizeMax &&
3923 to() <= kRangeCanonicalizeMax) {
3924 // If this is a range we expand the characters block by block,
3925 // expanding contiguous subranges (blocks) one at a time.
3926 // The approach is as follows. For a given start character we
3927 // look up the block that contains it, for instance 'a' if the
3928 // start character is 'c'. A block is characterized by the property
3929 // that all characters uncanonicalize in the same way as the first
3930 // element, except that each entry in the result is incremented
3931 // by the distance from the first element. So a-z is a block
3932 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3933 // uncanonicalizes to ['a' + k, 'A' + k].
3934 // Once we've found the start point we look up its uncanonicalization
3935 // and produce a range for each element. For instance for [c-f]
3936 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3937 // add a range if it is not already contained in the input, so [c-f]
3938 // will be skipped but [C-F] will be added. If this range is not
3939 // completely contained in a block we do this for all the blocks
3940 // covered by the range.
3941 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3942 // First, look up the block that contains the 'from' character.
3943 int length = canonrange.get(from(), '\0', range);
3944 if (length == 0) {
3945 range[0] = from();
3946 } else {
3947 ASSERT_EQ(1, length);
3948 }
3949 int pos = from();
3950 // The start of the current block. Note that except for the first
3951 // iteration 'start' is always equal to 'pos'.
3952 int start;
3953 // If it is not the start point of a block the entry contains the
3954 // offset of the character from the start point.
3955 if ((range[0] & kStartMarker) == 0) {
3956 start = pos - range[0];
3957 } else {
3958 start = pos;
3959 }
3960 // Then we add the ranges on at a time, incrementing the current
3961 // position to be after the last block each time. The position
3962 // always points to the start of a block.
3963 while (pos < to()) {
3964 length = canonrange.get(start, '\0', range);
3965 if (length == 0) {
3966 range[0] = start;
3967 } else {
3968 ASSERT_EQ(1, length);
3969 }
3970 ASSERT((range[0] & kStartMarker) != 0);
3971 // The start point of a block contains the distance to the end
3972 // of the range.
3973 int block_end = start + (range[0] & kPayloadMask) - 1;
3974 int end = (block_end > to()) ? to() : block_end;
3975 length = uncanonicalize.get(start, '\0', range);
3976 for (int i = 0; i < length; i++) {
3977 uc32 c = range[i];
3978 uc16 range_from = c + (pos - start);
3979 uc16 range_to = c + (end - start);
3980 if (!(from() <= range_from && range_to <= to())) {
3981 ranges->Add(CharacterRange(range_from, range_to));
3982 }
3983 }
3984 start = pos = block_end + 1;
3985 }
3986 } else {
3987 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3988 }
3989}
3990
3991
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003992ZoneList<CharacterRange>* CharacterSet::ranges() {
3993 if (ranges_ == NULL) {
3994 ranges_ = new ZoneList<CharacterRange>(2);
3995 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
3996 }
3997 return ranges_;
3998}
3999
4000
4001
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004002// -------------------------------------------------------------------
4003// Interest propagation
4004
4005
4006RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4007 for (int i = 0; i < siblings_.length(); i++) {
4008 RegExpNode* sibling = siblings_.Get(i);
4009 if (sibling->info()->Matches(info))
4010 return sibling;
4011 }
4012 return NULL;
4013}
4014
4015
4016RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4017 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004018 siblings_.Ensure(this);
4019 RegExpNode* result = TryGetSibling(info);
4020 if (result != NULL) return result;
4021 result = this->Clone();
4022 NodeInfo* new_info = result->info();
4023 new_info->ResetCompilationState();
4024 new_info->AddFromPreceding(info);
4025 AddSibling(result);
4026 *cloned = true;
4027 return result;
4028}
4029
4030
4031template <class C>
4032static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4033 NodeInfo full_info(*node->info());
4034 full_info.AddFromPreceding(info);
4035 bool cloned = false;
4036 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4037}
4038
4039
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004040// -------------------------------------------------------------------
4041// Splay tree
4042
4043
4044OutSet* OutSet::Extend(unsigned value) {
4045 if (Get(value))
4046 return this;
4047 if (successors() != NULL) {
4048 for (int i = 0; i < successors()->length(); i++) {
4049 OutSet* successor = successors()->at(i);
4050 if (successor->Get(value))
4051 return successor;
4052 }
4053 } else {
4054 successors_ = new ZoneList<OutSet*>(2);
4055 }
4056 OutSet* result = new OutSet(first_, remaining_);
4057 result->Set(value);
4058 successors()->Add(result);
4059 return result;
4060}
4061
4062
4063void OutSet::Set(unsigned value) {
4064 if (value < kFirstLimit) {
4065 first_ |= (1 << value);
4066 } else {
4067 if (remaining_ == NULL)
4068 remaining_ = new ZoneList<unsigned>(1);
4069 if (remaining_->is_empty() || !remaining_->Contains(value))
4070 remaining_->Add(value);
4071 }
4072}
4073
4074
4075bool OutSet::Get(unsigned value) {
4076 if (value < kFirstLimit) {
4077 return (first_ & (1 << value)) != 0;
4078 } else if (remaining_ == NULL) {
4079 return false;
4080 } else {
4081 return remaining_->Contains(value);
4082 }
4083}
4084
4085
4086const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4087const DispatchTable::Entry DispatchTable::Config::kNoValue;
4088
4089
4090void DispatchTable::AddRange(CharacterRange full_range, int value) {
4091 CharacterRange current = full_range;
4092 if (tree()->is_empty()) {
4093 // If this is the first range we just insert into the table.
4094 ZoneSplayTree<Config>::Locator loc;
4095 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4096 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4097 return;
4098 }
4099 // First see if there is a range to the left of this one that
4100 // overlaps.
4101 ZoneSplayTree<Config>::Locator loc;
4102 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4103 Entry* entry = &loc.value();
4104 // If we've found a range that overlaps with this one, and it
4105 // starts strictly to the left of this one, we have to fix it
4106 // because the following code only handles ranges that start on
4107 // or after the start point of the range we're adding.
4108 if (entry->from() < current.from() && entry->to() >= current.from()) {
4109 // Snap the overlapping range in half around the start point of
4110 // the range we're adding.
4111 CharacterRange left(entry->from(), current.from() - 1);
4112 CharacterRange right(current.from(), entry->to());
4113 // The left part of the overlapping range doesn't overlap.
4114 // Truncate the whole entry to be just the left part.
4115 entry->set_to(left.to());
4116 // The right part is the one that overlaps. We add this part
4117 // to the map and let the next step deal with merging it with
4118 // the range we're adding.
4119 ZoneSplayTree<Config>::Locator loc;
4120 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4121 loc.set_value(Entry(right.from(),
4122 right.to(),
4123 entry->out_set()));
4124 }
4125 }
4126 while (current.is_valid()) {
4127 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4128 (loc.value().from() <= current.to()) &&
4129 (loc.value().to() >= current.from())) {
4130 Entry* entry = &loc.value();
4131 // We have overlap. If there is space between the start point of
4132 // the range we're adding and where the overlapping range starts
4133 // then we have to add a range covering just that space.
4134 if (current.from() < entry->from()) {
4135 ZoneSplayTree<Config>::Locator ins;
4136 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4137 ins.set_value(Entry(current.from(),
4138 entry->from() - 1,
4139 empty()->Extend(value)));
4140 current.set_from(entry->from());
4141 }
4142 ASSERT_EQ(current.from(), entry->from());
4143 // If the overlapping range extends beyond the one we want to add
4144 // we have to snap the right part off and add it separately.
4145 if (entry->to() > current.to()) {
4146 ZoneSplayTree<Config>::Locator ins;
4147 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4148 ins.set_value(Entry(current.to() + 1,
4149 entry->to(),
4150 entry->out_set()));
4151 entry->set_to(current.to());
4152 }
4153 ASSERT(entry->to() <= current.to());
4154 // The overlapping range is now completely contained by the range
4155 // we're adding so we can just update it and move the start point
4156 // of the range we're adding just past it.
4157 entry->AddValue(value);
4158 // Bail out if the last interval ended at 0xFFFF since otherwise
4159 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004160 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004161 break;
4162 ASSERT(entry->to() + 1 > current.from());
4163 current.set_from(entry->to() + 1);
4164 } else {
4165 // There is no overlap so we can just add the range
4166 ZoneSplayTree<Config>::Locator ins;
4167 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4168 ins.set_value(Entry(current.from(),
4169 current.to(),
4170 empty()->Extend(value)));
4171 break;
4172 }
4173 }
4174}
4175
4176
4177OutSet* DispatchTable::Get(uc16 value) {
4178 ZoneSplayTree<Config>::Locator loc;
4179 if (!tree()->FindGreatestLessThan(value, &loc))
4180 return empty();
4181 Entry* entry = &loc.value();
4182 if (value <= entry->to())
4183 return entry->out_set();
4184 else
4185 return empty();
4186}
4187
4188
4189// -------------------------------------------------------------------
4190// Analysis
4191
4192
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004193void Analysis::EnsureAnalyzed(RegExpNode* that) {
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004194 StackLimitCheck check;
4195 if (check.HasOverflowed()) {
4196 fail("Stack overflow");
4197 return;
4198 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004199 if (that->info()->been_analyzed || that->info()->being_analyzed)
4200 return;
4201 that->info()->being_analyzed = true;
4202 that->Accept(this);
4203 that->info()->being_analyzed = false;
4204 that->info()->been_analyzed = true;
4205}
4206
4207
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004208void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004209 // nothing to do
4210}
4211
4212
ager@chromium.org8bb60582008-12-11 12:02:20 +00004213void TextNode::CalculateOffsets() {
4214 int element_count = elements()->length();
4215 // Set up the offsets of the elements relative to the start. This is a fixed
4216 // quantity since a TextNode can only contain fixed-width things.
4217 int cp_offset = 0;
4218 for (int i = 0; i < element_count; i++) {
4219 TextElement& elm = elements()->at(i);
4220 elm.cp_offset = cp_offset;
4221 if (elm.type == TextElement::ATOM) {
4222 cp_offset += elm.data.u_atom->data().length();
4223 } else {
4224 cp_offset++;
4225 Vector<const uc16> quarks = elm.data.u_atom->data();
4226 }
4227 }
4228}
4229
4230
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004231void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004232 if (ignore_case_) {
4233 that->MakeCaseIndependent();
4234 }
4235 EnsureAnalyzed(that->on_success());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004236 if (!has_failed()) {
4237 that->CalculateOffsets();
4238 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004239}
4240
4241
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004242void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004243 RegExpNode* target = that->on_success();
4244 EnsureAnalyzed(target);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004245 if (!has_failed()) {
4246 // If the next node is interested in what it follows then this node
4247 // has to be interested too so it can pass the information on.
4248 that->info()->AddFromFollowing(target->info());
4249 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004250}
4251
4252
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004253void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004254 NodeInfo* info = that->info();
4255 for (int i = 0; i < that->alternatives()->length(); i++) {
4256 RegExpNode* node = that->alternatives()->at(i).node();
4257 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004258 if (has_failed()) return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004259 // Anything the following nodes need to know has to be known by
4260 // this node also, so it can pass it on.
4261 info->AddFromFollowing(node->info());
4262 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004263}
4264
4265
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004266void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4267 NodeInfo* info = that->info();
4268 for (int i = 0; i < that->alternatives()->length(); i++) {
4269 RegExpNode* node = that->alternatives()->at(i).node();
4270 if (node != that->loop_node()) {
4271 EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004272 if (has_failed()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004273 info->AddFromFollowing(node->info());
4274 }
4275 }
4276 // Check the loop last since it may need the value of this node
4277 // to get a correct result.
4278 EnsureAnalyzed(that->loop_node());
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004279 if (!has_failed()) {
4280 info->AddFromFollowing(that->loop_node()->info());
4281 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004282}
4283
4284
4285void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004286 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004287}
4288
4289
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004290void Analysis::VisitAssertion(AssertionNode* that) {
4291 EnsureAnalyzed(that->on_success());
4292}
4293
4294
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004295// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004296// Dispatch table construction
4297
4298
4299void DispatchTableConstructor::VisitEnd(EndNode* that) {
4300 AddRange(CharacterRange::Everything());
4301}
4302
4303
4304void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4305 node->set_being_calculated(true);
4306 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4307 for (int i = 0; i < alternatives->length(); i++) {
4308 set_choice_index(i);
4309 alternatives->at(i).node()->Accept(this);
4310 }
4311 node->set_being_calculated(false);
4312}
4313
4314
4315class AddDispatchRange {
4316 public:
4317 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4318 : constructor_(constructor) { }
4319 void Call(uc32 from, DispatchTable::Entry entry);
4320 private:
4321 DispatchTableConstructor* constructor_;
4322};
4323
4324
4325void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4326 CharacterRange range(from, entry.to());
4327 constructor_->AddRange(range);
4328}
4329
4330
4331void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4332 if (node->being_calculated())
4333 return;
4334 DispatchTable* table = node->GetTable(ignore_case_);
4335 AddDispatchRange adder(this);
4336 table->ForEach(&adder);
4337}
4338
4339
4340void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4341 // TODO(160): Find the node that we refer back to and propagate its start
4342 // set back to here. For now we just accept anything.
4343 AddRange(CharacterRange::Everything());
4344}
4345
4346
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004347void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4348 RegExpNode* target = that->on_success();
4349 target->Accept(this);
4350}
4351
4352
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004353
4354static int CompareRangeByFrom(const CharacterRange* a,
4355 const CharacterRange* b) {
4356 return Compare<uc16>(a->from(), b->from());
4357}
4358
4359
4360void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4361 ranges->Sort(CompareRangeByFrom);
4362 uc16 last = 0;
4363 for (int i = 0; i < ranges->length(); i++) {
4364 CharacterRange range = ranges->at(i);
4365 if (last < range.from())
4366 AddRange(CharacterRange(last, range.from() - 1));
4367 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004368 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004369 return;
4370 } else {
4371 last = range.to() + 1;
4372 }
4373 }
4374 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004375 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004376}
4377
4378
4379void DispatchTableConstructor::VisitText(TextNode* that) {
4380 TextElement elm = that->elements()->at(0);
4381 switch (elm.type) {
4382 case TextElement::ATOM: {
4383 uc16 c = elm.data.u_atom->data()[0];
4384 AddRange(CharacterRange(c, c));
4385 break;
4386 }
4387 case TextElement::CHAR_CLASS: {
4388 RegExpCharacterClass* tree = elm.data.u_char_class;
4389 ZoneList<CharacterRange>* ranges = tree->ranges();
4390 if (tree->is_negated()) {
4391 AddInverse(ranges);
4392 } else {
4393 for (int i = 0; i < ranges->length(); i++)
4394 AddRange(ranges->at(i));
4395 }
4396 break;
4397 }
4398 default: {
4399 UNIMPLEMENTED();
4400 }
4401 }
4402}
4403
4404
4405void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004406 RegExpNode* target = that->on_success();
4407 target->Accept(this);
4408}
4409
4410
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004411RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4412 bool ignore_case,
4413 bool is_multiline,
4414 Handle<String> pattern,
4415 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004416 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004417 return IrregexpRegExpTooBig();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004418 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004419 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004420 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004421 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004422 0,
4423 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004424 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004425 RegExpNode* node = captured_body;
4426 if (!data->tree->IsAnchored()) {
4427 // Add a .*? at the beginning, outside the body capture, unless
4428 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004429 RegExpNode* loop_node =
4430 RegExpQuantifier::ToNode(0,
4431 RegExpTree::kInfinity,
4432 false,
4433 new RegExpCharacterClass('*'),
4434 &compiler,
4435 captured_body,
4436 data->contains_anchor);
4437
4438 if (data->contains_anchor) {
4439 // Unroll loop once, to take care of the case that might start
4440 // at the start of input.
4441 ChoiceNode* first_step_node = new ChoiceNode(2);
4442 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4443 first_step_node->AddAlternative(GuardedAlternative(
4444 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4445 node = first_step_node;
4446 } else {
4447 node = loop_node;
4448 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004449 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004450 data->node = node;
4451 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004452 analysis.EnsureAnalyzed(node);
sgjesse@chromium.org755c5b12009-05-29 11:04:38 +00004453 if (analysis.has_failed()) {
4454 const char* error_message = analysis.error_message();
4455 return CompilationResult(error_message);
4456 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004457
4458 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004459
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004460#ifdef V8_NATIVE_REGEXP
ager@chromium.org9085a012009-05-11 19:22:57 +00004461#ifdef V8_TARGET_ARCH_ARM
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004462 // ARM native regexp not implemented yet.
4463 UNREACHABLE();
ager@chromium.org5ec48922009-05-05 07:25:34 +00004464#endif
ager@chromium.org9085a012009-05-11 19:22:57 +00004465#ifdef V8_TARGET_ARCH_X64
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004466 // X64 native regexp not implemented yet.
4467 UNREACHABLE();
ager@chromium.org5ec48922009-05-05 07:25:34 +00004468#endif
ager@chromium.org9085a012009-05-11 19:22:57 +00004469#ifdef V8_TARGET_ARCH_IA32
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004470 RegExpMacroAssemblerIA32::Mode mode;
4471 if (is_ascii) {
4472 mode = RegExpMacroAssemblerIA32::ASCII;
4473 } else {
4474 mode = RegExpMacroAssemblerIA32::UC16;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004475 }
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004476 RegExpMacroAssemblerIA32 macro_assembler(mode,
4477 (data->capture_count + 1) * 2);
4478 return compiler.Assemble(&macro_assembler,
4479 node,
4480 data->capture_count,
4481 pattern);
4482#endif
4483#else // ! V8_NATIVE_REGEXP
4484 // Interpreted regexp.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004485 EmbeddedVector<byte, 1024> codes;
4486 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4487 return compiler.Assemble(&macro_assembler,
4488 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004489 data->capture_count,
4490 pattern);
kasperl@chromium.org68ac0092009-07-09 06:00:35 +00004491#endif // V8_NATIVE_REGEXP
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004492}
4493
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004494}} // namespace v8::internal