blob: 5589b77f21f5f5a11f4ea7fd3a715857b396b2a0 [file] [log] [blame]
ager@chromium.org381abbb2009-02-25 13:23:22 +00001// Copyright 2006-2009 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
ager@chromium.orga74f0da2008-12-03 16:05:52 +000030#include "ast.h"
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000031#include "compiler.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000032#include "execution.h"
33#include "factory.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000034#include "jsregexp-inl.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000035#include "platform.h"
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000036#include "runtime.h"
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000037#include "top.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000038#include "compilation-cache.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000039#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
ager@chromium.org32912102009-01-16 10:38:43 +000044#include "regexp-stack.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000045
ager@chromium.org9085a012009-05-11 19:22:57 +000046#ifdef V8_TARGET_ARCH_IA32
ager@chromium.org3a37e9b2009-04-27 09:26:21 +000047#include "ia32/macro-assembler-ia32.h"
48#include "ia32/regexp-macro-assembler-ia32.h"
ager@chromium.org9085a012009-05-11 19:22:57 +000049#elif V8_TARGET_ARCH_X64
50#include "x64/macro-assembler-x64.h"
51#include "x64/regexp-macro-assembler-x64.h"
52#elif V8_TARGET_ARCH_ARM
53#include "arm/regexp-macro-assembler-arm.h"
ager@chromium.orga74f0da2008-12-03 16:05:52 +000054#endif
55
56#include "interpreter-irregexp.h"
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000057
ager@chromium.orga74f0da2008-12-03 16:05:52 +000058
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000059namespace v8 { namespace internal {
60
61
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000062Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
63 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000064 Handle<String> flags,
65 bool* has_pending_exception) {
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000066 // Ensure that the constructor function has been loaded.
67 if (!constructor->IsLoaded()) {
68 LoadLazy(constructor, has_pending_exception);
ager@chromium.orga74f0da2008-12-03 16:05:52 +000069 if (*has_pending_exception) return Handle<Object>();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000070 }
71 // Call the construct code with 2 arguments.
72 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
73 Handle<Object>::cast(flags).location() };
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000074 return Execution::New(constructor, 2, argv, has_pending_exception);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000075}
76
77
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000078static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
79 int flags = JSRegExp::NONE;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +000080 for (int i = 0; i < str->length(); i++) {
81 switch (str->Get(i)) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000082 case 'i':
83 flags |= JSRegExp::IGNORE_CASE;
84 break;
85 case 'g':
86 flags |= JSRegExp::GLOBAL;
87 break;
88 case 'm':
89 flags |= JSRegExp::MULTILINE;
90 break;
91 }
92 }
93 return JSRegExp::Flags(flags);
94}
95
96
ager@chromium.orga74f0da2008-12-03 16:05:52 +000097static inline void ThrowRegExpException(Handle<JSRegExp> re,
98 Handle<String> pattern,
99 Handle<String> error_text,
100 const char* message) {
101 Handle<JSArray> array = Factory::NewJSArray(2);
102 SetElement(array, 0, pattern);
103 SetElement(array, 1, error_text);
104 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
105 Top::Throw(*regexp_err);
106}
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000107
108
ager@chromium.org8bb60582008-12-11 12:02:20 +0000109// Generic RegExp methods. Dispatches to implementation specific methods.
110
111
112class OffsetsVector {
113 public:
114 inline OffsetsVector(int num_registers)
115 : offsets_vector_length_(num_registers) {
116 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
117 vector_ = NewArray<int>(offsets_vector_length_);
118 } else {
119 vector_ = static_offsets_vector_;
120 }
121 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000122 inline ~OffsetsVector() {
123 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
124 DeleteArray(vector_);
125 vector_ = NULL;
126 }
127 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000128 inline int* vector() { return vector_; }
129 inline int length() { return offsets_vector_length_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000130
131 private:
132 int* vector_;
133 int offsets_vector_length_;
134 static const int kStaticOffsetsVectorSize = 50;
135 static int static_offsets_vector_[kStaticOffsetsVectorSize];
136};
137
138
139int OffsetsVector::static_offsets_vector_[
140 OffsetsVector::kStaticOffsetsVectorSize];
141
142
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000143Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
144 Handle<String> pattern,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000145 Handle<String> flag_str) {
146 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
147 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
148 bool in_cache = !cached.is_null();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000149 LOG(RegExpCompileEvent(re, in_cache));
150
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000151 Handle<Object> result;
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000152 if (in_cache) {
153 re->set_data(*cached);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000154 return re;
155 }
156 FlattenString(pattern);
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000157 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000158 RegExpCompileData parse_result;
159 FlatStringReader reader(pattern);
160 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
161 // Throw an exception if we fail to parse the pattern.
162 ThrowRegExpException(re,
163 pattern,
164 parse_result.error,
165 "malformed_regexp");
166 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000167 }
168
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000169 if (parse_result.simple && !flags.is_ignore_case()) {
170 // Parse-tree is a single atom that is equal to the pattern.
171 AtomCompile(re, pattern, flags, pattern);
172 } else if (parse_result.tree->IsAtom() &&
173 !flags.is_ignore_case() &&
174 parse_result.capture_count == 0) {
175 RegExpAtom* atom = parse_result.tree->AsAtom();
176 Vector<const uc16> atom_pattern = atom->data();
177 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
178 AtomCompile(re, pattern, flags, atom_string);
179 } else {
180 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
181 }
182 ASSERT(re->data()->IsFixedArray());
183 // Compilation succeeded so the data is set on the regexp
184 // and we can store it in the cache.
185 Handle<FixedArray> data(FixedArray::cast(re->data()));
186 CompilationCache::PutRegExp(pattern, flags, data);
187
188 return re;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000189}
190
191
192Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
193 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000194 int index,
195 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000196 switch (regexp->TypeTag()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000197 case JSRegExp::ATOM:
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000198 return AtomExec(regexp, subject, index, last_match_info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000199 case JSRegExp::IRREGEXP: {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000200 Handle<Object> result =
201 IrregexpExec(regexp, subject, index, last_match_info);
ager@chromium.org6f10e412009-02-13 10:11:16 +0000202 ASSERT(!result.is_null() || Top::has_pending_exception());
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000203 return result;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000204 }
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000205 default:
206 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000207 return Handle<Object>::null();
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000208 }
209}
210
211
ager@chromium.org8bb60582008-12-11 12:02:20 +0000212// RegExp Atom implementation: Simple string search using indexOf.
213
214
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000215void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
216 Handle<String> pattern,
217 JSRegExp::Flags flags,
218 Handle<String> match_pattern) {
219 Factory::SetRegExpAtomData(re,
220 JSRegExp::ATOM,
221 pattern,
222 flags,
223 match_pattern);
224}
225
226
227static void SetAtomLastCapture(FixedArray* array,
228 String* subject,
229 int from,
230 int to) {
231 NoHandleAllocation no_handles;
232 RegExpImpl::SetLastCaptureCount(array, 2);
233 RegExpImpl::SetLastSubject(array, subject);
234 RegExpImpl::SetLastInput(array, subject);
235 RegExpImpl::SetCapture(array, 0, from);
236 RegExpImpl::SetCapture(array, 1, to);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000237}
238
239
240Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
241 Handle<String> subject,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000242 int index,
243 Handle<JSArray> last_match_info) {
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +0000244 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000245
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000246 uint32_t start_index = index;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000247
ager@chromium.org7c537e22008-10-16 08:43:32 +0000248 int value = Runtime::StringMatch(subject, needle, start_index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000249 if (value == -1) return Factory::null_value();
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000250 ASSERT(last_match_info->HasFastElements());
ager@chromium.org7c537e22008-10-16 08:43:32 +0000251
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000252 {
253 NoHandleAllocation no_handles;
254 FixedArray* array = last_match_info->elements();
255 SetAtomLastCapture(array, *subject, value, value + needle->length());
256 }
257 return last_match_info;
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000258}
259
260
ager@chromium.org8bb60582008-12-11 12:02:20 +0000261// Irregexp implementation.
262
263
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000264// Ensures that the regexp object contains a compiled version of the
265// source for either ASCII or non-ASCII strings.
266// If the compiled version doesn't already exist, it is compiled
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000267// from the source pattern.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000268// If compilation fails, an exception is thrown and this function
269// returns false.
ager@chromium.org41826e72009-03-30 13:30:57 +0000270bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000271 int index;
272 if (is_ascii) {
273 index = JSRegExp::kIrregexpASCIICodeIndex;
274 } else {
275 index = JSRegExp::kIrregexpUC16CodeIndex;
276 }
277 Object* entry = re->DataAt(index);
278 if (!entry->IsTheHole()) {
279 // A value has already been compiled.
280 if (entry->IsJSObject()) {
281 // If it's a JS value, it's an error.
282 Top::Throw(entry);
283 return false;
284 }
285 return true;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000286 }
287
288 // Compile the RegExp.
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000289 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000290
291 JSRegExp::Flags flags = re->GetFlags();
292
293 Handle<String> pattern(re->Pattern());
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000294 if (!pattern->IsFlat()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000295 FlattenString(pattern);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000296 }
297
298 RegExpCompileData compile_data;
299 FlatStringReader reader(pattern);
300 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
301 // Throw an exception if we fail to parse the pattern.
302 // THIS SHOULD NOT HAPPEN. We already parsed it successfully once.
303 ThrowRegExpException(re,
304 pattern,
305 compile_data.error,
306 "malformed_regexp");
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000307 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000308 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000309 RegExpEngine::CompilationResult result =
ager@chromium.org8bb60582008-12-11 12:02:20 +0000310 RegExpEngine::Compile(&compile_data,
311 flags.is_ignore_case(),
312 flags.is_multiline(),
313 pattern,
314 is_ascii);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000315 if (result.error_message != NULL) {
316 // Unable to compile regexp.
317 Handle<JSArray> array = Factory::NewJSArray(2);
318 SetElement(array, 0, pattern);
319 SetElement(array,
320 1,
321 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
322 Handle<Object> regexp_err =
323 Factory::NewSyntaxError("malformed_regexp", array);
324 Top::Throw(*regexp_err);
325 re->SetDataAt(index, *regexp_err);
326 return false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000327 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000328
329 NoHandleAllocation no_handles;
330
331 FixedArray* data = FixedArray::cast(re->data());
332 data->set(index, result.code);
333 int register_max = IrregexpMaxRegisterCount(data);
334 if (result.num_registers > register_max) {
335 SetIrregexpMaxRegisterCount(data, result.num_registers);
336 }
337
338 return true;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000339}
340
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000341
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000342int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
343 return Smi::cast(
344 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000345}
346
347
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000348void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
349 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000350}
351
352
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000353int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
354 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000355}
356
357
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000358int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
359 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000360}
361
362
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000363ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
364 int index;
365 if (is_ascii) {
366 index = JSRegExp::kIrregexpASCIICodeIndex;
367 } else {
368 index = JSRegExp::kIrregexpUC16CodeIndex;
369 }
370 return ByteArray::cast(re->get(index));
371}
372
373
374Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
375 int index;
376 if (is_ascii) {
377 index = JSRegExp::kIrregexpASCIICodeIndex;
378 } else {
379 index = JSRegExp::kIrregexpUC16CodeIndex;
380 }
381 return Code::cast(re->get(index));
382}
383
384
385void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
386 Handle<String> pattern,
387 JSRegExp::Flags flags,
388 int capture_count) {
389 // Initialize compiled code entries to null.
390 Factory::SetRegExpIrregexpData(re,
391 JSRegExp::IRREGEXP,
392 pattern,
393 flags,
394 capture_count);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000395}
396
397
ager@chromium.org41826e72009-03-30 13:30:57 +0000398Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000399 Handle<String> subject,
ager@chromium.org41826e72009-03-30 13:30:57 +0000400 int previous_index,
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000401 Handle<JSArray> last_match_info) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000402 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000403
ager@chromium.org8bb60582008-12-11 12:02:20 +0000404 // Prepare space for the return values.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000405 int number_of_capture_registers =
ager@chromium.org41826e72009-03-30 13:30:57 +0000406 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000407 OffsetsVector offsets(number_of_capture_registers);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000408
ager@chromium.org8bb60582008-12-11 12:02:20 +0000409#ifdef DEBUG
410 if (FLAG_trace_regexp_bytecodes) {
ager@chromium.org41826e72009-03-30 13:30:57 +0000411 String* pattern = jsregexp->Pattern();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000412 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
413 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
414 }
415#endif
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000416
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000417 if (!subject->IsFlat()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000418 FlattenString(subject);
419 }
420
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000421 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
422
ager@chromium.org41826e72009-03-30 13:30:57 +0000423 int* offsets_vector = offsets.vector();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000424 bool rc;
425
ager@chromium.org41826e72009-03-30 13:30:57 +0000426 // Dispatch to the correct RegExp implementation.
427
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000428 Handle<String> original_subject = subject;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000429 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
430 if (UseNativeRegexp()) {
ager@chromium.org9085a012009-05-11 19:22:57 +0000431#if V8_TARGET_ARCH_IA32
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000432 RegExpMacroAssemblerIA32::Result res;
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000433 do {
ager@chromium.org5ec48922009-05-05 07:25:34 +0000434 bool is_ascii = subject->IsAsciiRepresentation();
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000435 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
436 return Handle<Object>::null();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000437 }
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000438 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
439 res = RegExpMacroAssemblerIA32::Match(code,
440 subject,
441 offsets_vector,
ager@chromium.org41826e72009-03-30 13:30:57 +0000442 offsets.length(),
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000443 previous_index);
444 // If result is RETRY, the string have changed representation, and we
445 // must restart from scratch.
446 } while (res == RegExpMacroAssemblerIA32::RETRY);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000447 if (res == RegExpMacroAssemblerIA32::EXCEPTION) {
448 ASSERT(Top::has_pending_exception());
449 return Handle<Object>::null();
450 }
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000451 ASSERT(res == RegExpMacroAssemblerIA32::SUCCESS
452 || res == RegExpMacroAssemblerIA32::FAILURE);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000453
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000454 rc = (res == RegExpMacroAssemblerIA32::SUCCESS);
ager@chromium.org9085a012009-05-11 19:22:57 +0000455#else
456 UNREACHABLE();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000457#endif
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000458 } else {
ager@chromium.org5ec48922009-05-05 07:25:34 +0000459 bool is_ascii = subject->IsAsciiRepresentation();
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000460 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
461 return Handle<Object>::null();
462 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000463 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
464 offsets_vector[i] = -1;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000465 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000466 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
ager@chromium.org8bb60582008-12-11 12:02:20 +0000467
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000468 rc = IrregexpInterpreter::Match(byte_codes,
469 subject,
470 offsets_vector,
471 previous_index);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000472 }
473
ager@chromium.org41826e72009-03-30 13:30:57 +0000474 // Handle results from RegExp implementation.
475
ager@chromium.org8bb60582008-12-11 12:02:20 +0000476 if (!rc) {
477 return Factory::null_value();
478 }
479
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000480 FixedArray* array = last_match_info->elements();
481 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000482 // The captures come in (start, end+1) pairs.
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000483 SetLastCaptureCount(array, number_of_capture_registers);
484 SetLastSubject(array, *original_subject);
485 SetLastInput(array, *original_subject);
ager@chromium.orgbb29dc92009-03-24 13:25:23 +0000486 for (int i = 0; i < number_of_capture_registers; i+=2) {
487 SetCapture(array, i, offsets_vector[i]);
488 SetCapture(array, i + 1, offsets_vector[i + 1]);
489 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000490 return last_match_info;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000491}
492
493
494// -------------------------------------------------------------------
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000495// Implementation of the Irregexp regular expression engine.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000496//
497// The Irregexp regular expression engine is intended to be a complete
498// implementation of ECMAScript regular expressions. It generates either
499// bytecodes or native code.
500
501// The Irregexp regexp engine is structured in three steps.
502// 1) The parser generates an abstract syntax tree. See ast.cc.
503// 2) From the AST a node network is created. The nodes are all
504// subclasses of RegExpNode. The nodes represent states when
505// executing a regular expression. Several optimizations are
506// performed on the node network.
507// 3) From the nodes we generate either byte codes or native code
508// that can actually execute the regular expression (perform
509// the search). The code generation step is described in more
510// detail below.
511
512// Code generation.
513//
514// The nodes are divided into four main categories.
515// * Choice nodes
516// These represent places where the regular expression can
517// match in more than one way. For example on entry to an
518// alternation (foo|bar) or a repetition (*, +, ? or {}).
519// * Action nodes
520// These represent places where some action should be
521// performed. Examples include recording the current position
522// in the input string to a register (in order to implement
523// captures) or other actions on register for example in order
524// to implement the counters needed for {} repetitions.
525// * Matching nodes
526// These attempt to match some element part of the input string.
527// Examples of elements include character classes, plain strings
528// or back references.
529// * End nodes
530// These are used to implement the actions required on finding
531// a successful match or failing to find a match.
532//
533// The code generated (whether as byte codes or native code) maintains
534// some state as it runs. This consists of the following elements:
535//
536// * The capture registers. Used for string captures.
537// * Other registers. Used for counters etc.
538// * The current position.
539// * The stack of backtracking information. Used when a matching node
540// fails to find a match and needs to try an alternative.
541//
542// Conceptual regular expression execution model:
543//
544// There is a simple conceptual model of regular expression execution
545// which will be presented first. The actual code generated is a more
546// efficient simulation of the simple conceptual model:
547//
548// * Choice nodes are implemented as follows:
549// For each choice except the last {
550// push current position
551// push backtrack code location
552// <generate code to test for choice>
553// backtrack code location:
554// pop current position
555// }
556// <generate code to test for last choice>
557//
558// * Actions nodes are generated as follows
559// <push affected registers on backtrack stack>
560// <generate code to perform action>
561// push backtrack code location
562// <generate code to test for following nodes>
563// backtrack code location:
564// <pop affected registers to restore their state>
565// <pop backtrack location from stack and go to it>
566//
567// * Matching nodes are generated as follows:
568// if input string matches at current position
569// update current position
570// <generate code to test for following nodes>
571// else
572// <pop backtrack location from stack and go to it>
573//
574// Thus it can be seen that the current position is saved and restored
575// by the choice nodes, whereas the registers are saved and restored by
576// by the action nodes that manipulate them.
577//
578// The other interesting aspect of this model is that nodes are generated
579// at the point where they are needed by a recursive call to Emit(). If
580// the node has already been code generated then the Emit() call will
581// generate a jump to the previously generated code instead. In order to
582// limit recursion it is possible for the Emit() function to put the node
583// on a work list for later generation and instead generate a jump. The
584// destination of the jump is resolved later when the code is generated.
585//
586// Actual regular expression code generation.
587//
588// Code generation is actually more complicated than the above. In order
589// to improve the efficiency of the generated code some optimizations are
590// performed
591//
592// * Choice nodes have 1-character lookahead.
593// A choice node looks at the following character and eliminates some of
594// the choices immediately based on that character. This is not yet
595// implemented.
596// * Simple greedy loops store reduced backtracking information.
597// A quantifier like /.*foo/m will greedily match the whole input. It will
598// then need to backtrack to a point where it can match "foo". The naive
599// implementation of this would push each character position onto the
600// backtracking stack, then pop them off one by one. This would use space
601// proportional to the length of the input string. However since the "."
602// can only match in one way and always has a constant length (in this case
603// of 1) it suffices to store the current position on the top of the stack
604// once. Matching now becomes merely incrementing the current position and
605// backtracking becomes decrementing the current position and checking the
606// result against the stored current position. This is faster and saves
607// space.
608// * The current state is virtualized.
609// This is used to defer expensive operations until it is clear that they
610// are needed and to generate code for a node more than once, allowing
611// specialized an efficient versions of the code to be created. This is
612// explained in the section below.
613//
614// Execution state virtualization.
615//
616// Instead of emitting code, nodes that manipulate the state can record their
ager@chromium.org32912102009-01-16 10:38:43 +0000617// manipulation in an object called the Trace. The Trace object can record a
618// current position offset, an optional backtrack code location on the top of
619// the virtualized backtrack stack and some register changes. When a node is
620// to be emitted it can flush the Trace or update it. Flushing the Trace
ager@chromium.org8bb60582008-12-11 12:02:20 +0000621// will emit code to bring the actual state into line with the virtual state.
622// Avoiding flushing the state can postpone some work (eg updates of capture
623// registers). Postponing work can save time when executing the regular
624// expression since it may be found that the work never has to be done as a
625// failure to match can occur. In addition it is much faster to jump to a
626// known backtrack code location than it is to pop an unknown backtrack
627// location from the stack and jump there.
628//
ager@chromium.org32912102009-01-16 10:38:43 +0000629// The virtual state found in the Trace affects code generation. For example
630// the virtual state contains the difference between the actual current
631// position and the virtual current position, and matching code needs to use
632// this offset to attempt a match in the correct location of the input
633// string. Therefore code generated for a non-trivial trace is specialized
634// to that trace. The code generator therefore has the ability to generate
635// code for each node several times. In order to limit the size of the
636// generated code there is an arbitrary limit on how many specialized sets of
637// code may be generated for a given node. If the limit is reached, the
638// trace is flushed and a generic version of the code for a node is emitted.
639// This is subsequently used for that node. The code emitted for non-generic
640// trace is not recorded in the node and so it cannot currently be reused in
641// the event that code generation is requested for an identical trace.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000642
643
644void RegExpTree::AppendToText(RegExpText* text) {
645 UNREACHABLE();
646}
647
648
649void RegExpAtom::AppendToText(RegExpText* text) {
650 text->AddElement(TextElement::Atom(this));
651}
652
653
654void RegExpCharacterClass::AppendToText(RegExpText* text) {
655 text->AddElement(TextElement::CharClass(this));
656}
657
658
659void RegExpText::AppendToText(RegExpText* text) {
660 for (int i = 0; i < elements()->length(); i++)
661 text->AddElement(elements()->at(i));
662}
663
664
665TextElement TextElement::Atom(RegExpAtom* atom) {
666 TextElement result = TextElement(ATOM);
667 result.data.u_atom = atom;
668 return result;
669}
670
671
672TextElement TextElement::CharClass(
673 RegExpCharacterClass* char_class) {
674 TextElement result = TextElement(CHAR_CLASS);
675 result.data.u_char_class = char_class;
676 return result;
677}
678
679
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000680int TextElement::length() {
681 if (type == ATOM) {
682 return data.u_atom->length();
683 } else {
684 ASSERT(type == CHAR_CLASS);
685 return 1;
686 }
687}
688
689
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000690DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
691 if (table_ == NULL) {
692 table_ = new DispatchTable();
693 DispatchTableConstructor cons(table_, ignore_case);
694 cons.BuildTable(this);
695 }
696 return table_;
697}
698
699
700class RegExpCompiler {
701 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000702 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000703
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000704 int AllocateRegister() {
705 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
706 reg_exp_too_big_ = true;
707 return next_register_;
708 }
709 return next_register_++;
710 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000711
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000712 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
713 RegExpNode* start,
714 int capture_count,
715 Handle<String> pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000716
717 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
718
719 static const int kImplementationOffset = 0;
720 static const int kNumberOfRegistersOffset = 0;
721 static const int kCodeOffset = 1;
722
723 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
724 EndNode* accept() { return accept_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000725
726 static const int kMaxRecursion = 100;
727 inline int recursion_depth() { return recursion_depth_; }
728 inline void IncrementRecursionDepth() { recursion_depth_++; }
729 inline void DecrementRecursionDepth() { recursion_depth_--; }
730
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000731 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
732
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000733 inline bool ignore_case() { return ignore_case_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000734 inline bool ascii() { return ascii_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000735
ager@chromium.org32912102009-01-16 10:38:43 +0000736 static const int kNoRegister = -1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000737 private:
738 EndNode* accept_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000739 int next_register_;
740 List<RegExpNode*>* work_list_;
741 int recursion_depth_;
742 RegExpMacroAssembler* macro_assembler_;
743 bool ignore_case_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000744 bool ascii_;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000745 bool reg_exp_too_big_;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000746};
747
748
749class RecursionCheck {
750 public:
751 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
752 compiler->IncrementRecursionDepth();
753 }
754 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
755 private:
756 RegExpCompiler* compiler_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000757};
758
759
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000760static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
761 return RegExpEngine::CompilationResult("RegExp too big");
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000762}
763
764
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000765// Attempts to compile the regexp using an Irregexp code generator. Returns
766// a fixed array or a null handle depending on whether it succeeded.
ager@chromium.org8bb60582008-12-11 12:02:20 +0000767RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000768 : next_register_(2 * (capture_count + 1)),
769 work_list_(NULL),
770 recursion_depth_(0),
ager@chromium.org8bb60582008-12-11 12:02:20 +0000771 ignore_case_(ignore_case),
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000772 ascii_(ascii),
773 reg_exp_too_big_(false) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000774 accept_ = new EndNode(EndNode::ACCEPT);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000775 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000776}
777
778
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000779RegExpEngine::CompilationResult RegExpCompiler::Assemble(
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000780 RegExpMacroAssembler* macro_assembler,
781 RegExpNode* start,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000782 int capture_count,
783 Handle<String> pattern) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000784#ifdef DEBUG
785 if (FLAG_trace_regexp_assembler)
786 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
787 else
788#endif
789 macro_assembler_ = macro_assembler;
790 List <RegExpNode*> work_list(0);
791 work_list_ = &work_list;
792 Label fail;
iposva@chromium.org245aa852009-02-10 00:49:54 +0000793 macro_assembler_->PushBacktrack(&fail);
ager@chromium.org32912102009-01-16 10:38:43 +0000794 Trace new_trace;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000795 start->Emit(this, &new_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000796 macro_assembler_->Bind(&fail);
797 macro_assembler_->Fail();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000798 while (!work_list.is_empty()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000799 work_list.RemoveLast()->Emit(this, &new_trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000800 }
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000801 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
802
ager@chromium.org8bb60582008-12-11 12:02:20 +0000803 Handle<Object> code = macro_assembler_->GetCode(pattern);
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000804
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000805 work_list_ = NULL;
806#ifdef DEBUG
807 if (FLAG_trace_regexp_assembler) {
808 delete macro_assembler_;
809 }
810#endif
kasperl@chromium.org7be3c992009-03-12 07:19:55 +0000811 return RegExpEngine::CompilationResult(*code, next_register_);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000812}
813
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000814
ager@chromium.org32912102009-01-16 10:38:43 +0000815bool Trace::DeferredAction::Mentions(int that) {
816 if (type() == ActionNode::CLEAR_CAPTURES) {
817 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
818 return range.Contains(that);
819 } else {
820 return reg() == that;
821 }
822}
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000823
ager@chromium.org32912102009-01-16 10:38:43 +0000824
825bool Trace::mentions_reg(int reg) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000826 for (DeferredAction* action = actions_;
827 action != NULL;
828 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000829 if (action->Mentions(reg))
830 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000831 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000832 return false;
833}
834
835
ager@chromium.org32912102009-01-16 10:38:43 +0000836bool Trace::GetStoredPosition(int reg, int* cp_offset) {
837 ASSERT_EQ(0, *cp_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000838 for (DeferredAction* action = actions_;
839 action != NULL;
840 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000841 if (action->Mentions(reg)) {
842 if (action->type() == ActionNode::STORE_POSITION) {
843 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
844 return true;
845 } else {
846 return false;
847 }
848 }
849 }
850 return false;
851}
852
853
854int Trace::FindAffectedRegisters(OutSet* affected_registers) {
855 int max_register = RegExpCompiler::kNoRegister;
856 for (DeferredAction* action = actions_;
857 action != NULL;
858 action = action->next()) {
859 if (action->type() == ActionNode::CLEAR_CAPTURES) {
860 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
861 for (int i = range.from(); i <= range.to(); i++)
862 affected_registers->Set(i);
863 if (range.to() > max_register) max_register = range.to();
864 } else {
865 affected_registers->Set(action->reg());
866 if (action->reg() > max_register) max_register = action->reg();
867 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000868 }
869 return max_register;
870}
871
872
ager@chromium.org32912102009-01-16 10:38:43 +0000873void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
874 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000875 OutSet& registers_to_pop,
876 OutSet& registers_to_clear) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000877 for (int reg = max_register; reg >= 0; reg--) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000878 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
879 else if (registers_to_clear.Get(reg)) {
880 int clear_to = reg;
881 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
882 reg--;
883 }
884 assembler->ClearRegisters(reg, clear_to);
885 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000886 }
887}
888
889
ager@chromium.org32912102009-01-16 10:38:43 +0000890void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
891 int max_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000892 OutSet& affected_registers,
893 OutSet* registers_to_pop,
894 OutSet* registers_to_clear) {
895 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
896 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
897
ager@chromium.org8bb60582008-12-11 12:02:20 +0000898 for (int reg = 0; reg <= max_register; reg++) {
899 if (!affected_registers.Get(reg)) {
900 continue;
901 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000902 // Count pushes performed to force a stack limit check occasionally.
903 int pushes = 0;
904
905 // The chronologically first deferred action in the trace
906 // is used to infer the action needed to restore a register
907 // to its previous state (or not, if it's safe to ignore it).
908 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
909 DeferredActionUndoType undo_action = IGNORE;
910
ager@chromium.org8bb60582008-12-11 12:02:20 +0000911 int value = 0;
912 bool absolute = false;
ager@chromium.org32912102009-01-16 10:38:43 +0000913 bool clear = false;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000914 int store_position = -1;
915 // This is a little tricky because we are scanning the actions in reverse
916 // historical order (newest first).
917 for (DeferredAction* action = actions_;
918 action != NULL;
919 action = action->next()) {
ager@chromium.org32912102009-01-16 10:38:43 +0000920 if (action->Mentions(reg)) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000921 switch (action->type()) {
922 case ActionNode::SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +0000923 Trace::DeferredSetRegister* psr =
924 static_cast<Trace::DeferredSetRegister*>(action);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000925 if (!absolute) {
926 value += psr->value();
927 absolute = true;
928 }
929 // SET_REGISTER is currently only used for newly introduced loop
930 // counters. They can have a significant previous value if they
931 // occour in a loop. TODO(lrn): Propagate this information, so
932 // we can set undo_action to IGNORE if we know there is no value to
933 // restore.
934 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000935 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000936 ASSERT(!clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000937 break;
938 }
939 case ActionNode::INCREMENT_REGISTER:
940 if (!absolute) {
941 value++;
942 }
943 ASSERT_EQ(store_position, -1);
ager@chromium.org32912102009-01-16 10:38:43 +0000944 ASSERT(!clear);
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000945 undo_action = RESTORE;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000946 break;
947 case ActionNode::STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +0000948 Trace::DeferredCapture* pc =
949 static_cast<Trace::DeferredCapture*>(action);
950 if (!clear && store_position == -1) {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000951 store_position = pc->cp_offset();
952 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000953
954 // For captures we know that stores and clears alternate.
955 // Other register, are never cleared, and if the occur
956 // inside a loop, they might be assigned more than once.
957 if (reg <= 1) {
958 // Registers zero and one, aka "capture zero", is
959 // always set correctly if we succeed. There is no
960 // need to undo a setting on backtrack, because we
961 // will set it again or fail.
962 undo_action = IGNORE;
963 } else {
964 undo_action = pc->is_capture() ? CLEAR : RESTORE;
965 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000966 ASSERT(!absolute);
967 ASSERT_EQ(value, 0);
968 break;
969 }
ager@chromium.org32912102009-01-16 10:38:43 +0000970 case ActionNode::CLEAR_CAPTURES: {
971 // Since we're scanning in reverse order, if we've already
972 // set the position we have to ignore historically earlier
973 // clearing operations.
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000974 if (store_position == -1) {
ager@chromium.org32912102009-01-16 10:38:43 +0000975 clear = true;
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000976 }
977 undo_action = RESTORE;
ager@chromium.org32912102009-01-16 10:38:43 +0000978 ASSERT(!absolute);
979 ASSERT_EQ(value, 0);
980 break;
981 }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000982 default:
983 UNREACHABLE();
984 break;
985 }
986 }
987 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +0000988 // Prepare for the undo-action (e.g., push if it's going to be popped).
989 if (undo_action == RESTORE) {
990 pushes++;
991 RegExpMacroAssembler::StackCheckFlag stack_check =
992 RegExpMacroAssembler::kNoStackLimitCheck;
993 if (pushes == push_limit) {
994 stack_check = RegExpMacroAssembler::kCheckStackLimit;
995 pushes = 0;
996 }
997
998 assembler->PushRegister(reg, stack_check);
999 registers_to_pop->Set(reg);
1000 } else if (undo_action == CLEAR) {
1001 registers_to_clear->Set(reg);
1002 }
1003 // Perform the chronologically last action (or accumulated increment)
1004 // for the register.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001005 if (store_position != -1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001006 assembler->WriteCurrentPositionToRegister(reg, store_position);
ager@chromium.org32912102009-01-16 10:38:43 +00001007 } else if (clear) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001008 assembler->ClearRegisters(reg, reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001009 } else if (absolute) {
1010 assembler->SetRegister(reg, value);
1011 } else if (value != 0) {
1012 assembler->AdvanceRegister(reg, value);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001013 }
1014 }
1015}
1016
1017
ager@chromium.org8bb60582008-12-11 12:02:20 +00001018// This is called as we come into a loop choice node and some other tricky
ager@chromium.org32912102009-01-16 10:38:43 +00001019// nodes. It normalizes the state of the code generator to ensure we can
ager@chromium.org8bb60582008-12-11 12:02:20 +00001020// generate generic code.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001021void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001022 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001023
iposva@chromium.org245aa852009-02-10 00:49:54 +00001024 ASSERT(!is_trivial());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001025
1026 if (actions_ == NULL && backtrack() == NULL) {
1027 // Here we just have some deferred cp advances to fix and we are back to
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001028 // a normal situation. We may also have to forget some information gained
1029 // through a quick check that was already performed.
1030 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001031 // Create a new trivial state and generate the node with that.
ager@chromium.org32912102009-01-16 10:38:43 +00001032 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001033 successor->Emit(compiler, &new_state);
1034 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001035 }
1036
1037 // Generate deferred actions here along with code to undo them again.
1038 OutSet affected_registers;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001039
ager@chromium.org381abbb2009-02-25 13:23:22 +00001040 if (backtrack() != NULL) {
1041 // Here we have a concrete backtrack location. These are set up by choice
1042 // nodes and so they indicate that we have a deferred save of the current
1043 // position which we may need to emit here.
1044 assembler->PushCurrentPosition();
1045 }
1046
ager@chromium.org8bb60582008-12-11 12:02:20 +00001047 int max_register = FindAffectedRegisters(&affected_registers);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001048 OutSet registers_to_pop;
1049 OutSet registers_to_clear;
1050 PerformDeferredActions(assembler,
1051 max_register,
1052 affected_registers,
1053 &registers_to_pop,
1054 &registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001055 if (cp_offset_ != 0) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001056 assembler->AdvanceCurrentPosition(cp_offset_);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001057 }
1058
1059 // Create a new trivial state and generate the node with that.
1060 Label undo;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001061 assembler->PushBacktrack(&undo);
ager@chromium.org32912102009-01-16 10:38:43 +00001062 Trace new_state;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001063 successor->Emit(compiler, &new_state);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001064
1065 // On backtrack we need to restore state.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001066 assembler->Bind(&undo);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001067 RestoreAffectedRegisters(assembler,
1068 max_register,
1069 registers_to_pop,
1070 registers_to_clear);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001071 if (backtrack() == NULL) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001072 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001073 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001074 assembler->PopCurrentPosition();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001075 assembler->GoTo(backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001076 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001077}
1078
1079
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001080void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001081 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001082
1083 // Omit flushing the trace. We discard the entire stack frame anyway.
1084
ager@chromium.org8bb60582008-12-11 12:02:20 +00001085 if (!label()->is_bound()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001086 // We are completely independent of the trace, since we ignore it,
1087 // so this code can be used as the generic version.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001088 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001089 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001090
1091 // Throw away everything on the backtrack stack since the start
1092 // of the negative submatch and restore the character position.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001093 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1094 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001095 if (clear_capture_count_ > 0) {
1096 // Clear any captures that might have been performed during the success
1097 // of the body of the negative look-ahead.
1098 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1099 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1100 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001101 // Now that we have unwound the stack we find at the top of the stack the
1102 // backtrack that the BeginSubmatch node got.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001103 assembler->Backtrack();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001104}
1105
1106
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001107void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00001108 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001109 trace->Flush(compiler, this);
1110 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001111 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001112 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001113 if (!label()->is_bound()) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001114 assembler->Bind(label());
ager@chromium.org8bb60582008-12-11 12:02:20 +00001115 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001116 switch (action_) {
1117 case ACCEPT:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001118 assembler->Succeed();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001119 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001120 case BACKTRACK:
ager@chromium.org32912102009-01-16 10:38:43 +00001121 assembler->GoTo(trace->backtrack());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001122 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001123 case NEGATIVE_SUBMATCH_SUCCESS:
1124 // This case is handled in a different virtual method.
1125 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001126 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001127 UNIMPLEMENTED();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001128}
1129
1130
1131void GuardedAlternative::AddGuard(Guard* guard) {
1132 if (guards_ == NULL)
1133 guards_ = new ZoneList<Guard*>(1);
1134 guards_->Add(guard);
1135}
1136
1137
ager@chromium.org8bb60582008-12-11 12:02:20 +00001138ActionNode* ActionNode::SetRegister(int reg,
1139 int val,
1140 RegExpNode* on_success) {
1141 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001142 result->data_.u_store_register.reg = reg;
1143 result->data_.u_store_register.value = val;
1144 return result;
1145}
1146
1147
1148ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1149 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1150 result->data_.u_increment_register.reg = reg;
1151 return result;
1152}
1153
1154
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001155ActionNode* ActionNode::StorePosition(int reg,
1156 bool is_capture,
1157 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001158 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1159 result->data_.u_position_register.reg = reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001160 result->data_.u_position_register.is_capture = is_capture;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001161 return result;
1162}
1163
1164
ager@chromium.org32912102009-01-16 10:38:43 +00001165ActionNode* ActionNode::ClearCaptures(Interval range,
1166 RegExpNode* on_success) {
1167 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1168 result->data_.u_clear_captures.range_from = range.from();
1169 result->data_.u_clear_captures.range_to = range.to();
1170 return result;
1171}
1172
1173
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001174ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1175 int position_reg,
1176 RegExpNode* on_success) {
1177 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1178 result->data_.u_submatch.stack_pointer_register = stack_reg;
1179 result->data_.u_submatch.current_position_register = position_reg;
1180 return result;
1181}
1182
1183
ager@chromium.org8bb60582008-12-11 12:02:20 +00001184ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1185 int position_reg,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001186 int clear_register_count,
1187 int clear_register_from,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001188 RegExpNode* on_success) {
1189 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001190 result->data_.u_submatch.stack_pointer_register = stack_reg;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001191 result->data_.u_submatch.current_position_register = position_reg;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001192 result->data_.u_submatch.clear_register_count = clear_register_count;
1193 result->data_.u_submatch.clear_register_from = clear_register_from;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001194 return result;
1195}
1196
1197
ager@chromium.org32912102009-01-16 10:38:43 +00001198ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1199 int repetition_register,
1200 int repetition_limit,
1201 RegExpNode* on_success) {
1202 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1203 result->data_.u_empty_match_check.start_register = start_register;
1204 result->data_.u_empty_match_check.repetition_register = repetition_register;
1205 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1206 return result;
1207}
1208
1209
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001210#define DEFINE_ACCEPT(Type) \
1211 void Type##Node::Accept(NodeVisitor* visitor) { \
1212 visitor->Visit##Type(this); \
1213 }
1214FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1215#undef DEFINE_ACCEPT
1216
1217
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001218void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1219 visitor->VisitLoopChoice(this);
1220}
1221
1222
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001223// -------------------------------------------------------------------
1224// Emit code.
1225
1226
1227void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1228 Guard* guard,
ager@chromium.org32912102009-01-16 10:38:43 +00001229 Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001230 switch (guard->op()) {
1231 case Guard::LT:
ager@chromium.org32912102009-01-16 10:38:43 +00001232 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001233 macro_assembler->IfRegisterGE(guard->reg(),
1234 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001235 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001236 break;
1237 case Guard::GEQ:
ager@chromium.org32912102009-01-16 10:38:43 +00001238 ASSERT(!trace->mentions_reg(guard->reg()));
ager@chromium.org8bb60582008-12-11 12:02:20 +00001239 macro_assembler->IfRegisterLT(guard->reg(),
1240 guard->value(),
ager@chromium.org32912102009-01-16 10:38:43 +00001241 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001242 break;
1243 }
1244}
1245
1246
1247static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1248static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1249
1250
ager@chromium.org381abbb2009-02-25 13:23:22 +00001251// Returns the number of characters in the equivalence class, omitting those
1252// that cannot occur in the source string because it is ASCII.
1253static int GetCaseIndependentLetters(uc16 character,
1254 bool ascii_subject,
1255 unibrow::uchar* letters) {
1256 int length = uncanonicalize.get(character, '\0', letters);
1257 // Unibrow returns 0 or 1 for characters where case independependence is
1258 // trivial.
1259 if (length == 0) {
1260 letters[0] = character;
1261 length = 1;
1262 }
1263 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1264 return length;
1265 }
1266 // The standard requires that non-ASCII characters cannot have ASCII
1267 // character codes in their equivalence class.
1268 return 0;
1269}
1270
1271
1272static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1273 uc16 c,
1274 Label* on_failure,
1275 int cp_offset,
1276 bool check,
1277 bool preloaded) {
1278 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1279 bool bound_checked = false;
1280 if (!preloaded) {
1281 assembler->LoadCurrentCharacter(
1282 cp_offset,
1283 on_failure,
1284 check);
1285 bound_checked = true;
1286 }
1287 assembler->CheckNotCharacter(c, on_failure);
1288 return bound_checked;
1289}
1290
1291
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001292// Only emits non-letters (things that don't have case). Only used for case
1293// independent matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001294static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1295 uc16 c,
1296 Label* on_failure,
1297 int cp_offset,
1298 bool check,
1299 bool preloaded) {
1300 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1301 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001302 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001303 int length = GetCaseIndependentLetters(c, ascii, chars);
1304 if (length < 1) {
1305 // This can't match. Must be an ASCII subject and a non-ASCII character.
1306 // We do not need to do anything since the ASCII pass already handled this.
1307 return false; // Bounds not checked.
1308 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001309 bool checked = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001310 // We handle the length > 1 case in a later pass.
1311 if (length == 1) {
1312 if (ascii && c > String::kMaxAsciiCharCodeU) {
1313 // Can't match - see above.
1314 return false; // Bounds not checked.
1315 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001316 if (!preloaded) {
1317 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1318 checked = check;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001319 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001320 macro_assembler->CheckNotCharacter(c, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001321 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001322 return checked;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001323}
1324
1325
1326static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001327 bool ascii,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001328 uc16 c1,
1329 uc16 c2,
1330 Label* on_failure) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001331 uc16 char_mask;
1332 if (ascii) {
1333 char_mask = String::kMaxAsciiCharCode;
1334 } else {
1335 char_mask = String::kMaxUC16CharCode;
1336 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001337 uc16 exor = c1 ^ c2;
1338 // Check whether exor has only one bit set.
1339 if (((exor - 1) & exor) == 0) {
1340 // If c1 and c2 differ only by one bit.
1341 // Ecma262UnCanonicalize always gives the highest number last.
1342 ASSERT(c2 > c1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001343 uc16 mask = char_mask ^ exor;
1344 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001345 return true;
1346 }
1347 ASSERT(c2 > c1);
1348 uc16 diff = c2 - c1;
1349 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1350 // If the characters differ by 2^n but don't differ by one bit then
1351 // subtract the difference from the found character, then do the or
1352 // trick. We avoid the theoretical case where negative numbers are
1353 // involved in order to simplify code generation.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001354 uc16 mask = char_mask ^ diff;
1355 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1356 diff,
1357 mask,
1358 on_failure);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001359 return true;
1360 }
1361 return false;
1362}
1363
1364
ager@chromium.org381abbb2009-02-25 13:23:22 +00001365typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1366 uc16 c,
1367 Label* on_failure,
1368 int cp_offset,
1369 bool check,
1370 bool preloaded);
1371
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001372// Only emits letters (things that have case). Only used for case independent
1373// matches.
ager@chromium.org381abbb2009-02-25 13:23:22 +00001374static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1375 uc16 c,
1376 Label* on_failure,
1377 int cp_offset,
1378 bool check,
1379 bool preloaded) {
1380 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1381 bool ascii = compiler->ascii();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001382 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001383 int length = GetCaseIndependentLetters(c, ascii, chars);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001384 if (length <= 1) return false;
1385 // We may not need to check against the end of the input string
1386 // if this character lies before a character that matched.
1387 if (!preloaded) {
1388 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001389 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001390 Label ok;
1391 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1392 switch (length) {
1393 case 2: {
1394 if (ShortCutEmitCharacterPair(macro_assembler,
1395 ascii,
1396 chars[0],
1397 chars[1],
1398 on_failure)) {
1399 } else {
1400 macro_assembler->CheckCharacter(chars[0], &ok);
1401 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1402 macro_assembler->Bind(&ok);
1403 }
1404 break;
1405 }
1406 case 4:
1407 macro_assembler->CheckCharacter(chars[3], &ok);
1408 // Fall through!
1409 case 3:
1410 macro_assembler->CheckCharacter(chars[0], &ok);
1411 macro_assembler->CheckCharacter(chars[1], &ok);
1412 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1413 macro_assembler->Bind(&ok);
1414 break;
1415 default:
1416 UNREACHABLE();
1417 break;
1418 }
1419 return true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001420}
1421
1422
1423static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1424 RegExpCharacterClass* cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001425 bool ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00001426 Label* on_failure,
1427 int cp_offset,
1428 bool check_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001429 bool preloaded) {
1430 if (cc->is_standard() &&
1431 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1432 cp_offset,
1433 check_offset,
1434 on_failure)) {
1435 return;
1436 }
1437
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001438 ZoneList<CharacterRange>* ranges = cc->ranges();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001439 int max_char;
1440 if (ascii) {
1441 max_char = String::kMaxAsciiCharCode;
1442 } else {
1443 max_char = String::kMaxUC16CharCode;
1444 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001445
1446 Label success;
1447
1448 Label* char_is_in_class =
1449 cc->is_negated() ? on_failure : &success;
1450
1451 int range_count = ranges->length();
1452
ager@chromium.org8bb60582008-12-11 12:02:20 +00001453 int last_valid_range = range_count - 1;
1454 while (last_valid_range >= 0) {
1455 CharacterRange& range = ranges->at(last_valid_range);
1456 if (range.from() <= max_char) {
1457 break;
1458 }
1459 last_valid_range--;
1460 }
1461
1462 if (last_valid_range < 0) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001463 if (!cc->is_negated()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001464 // TODO(plesner): We can remove this when the node level does our
1465 // ASCII optimizations for us.
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001466 macro_assembler->GoTo(on_failure);
1467 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001468 if (check_offset) {
1469 macro_assembler->CheckPosition(cp_offset, on_failure);
1470 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001471 return;
1472 }
1473
ager@chromium.org8bb60582008-12-11 12:02:20 +00001474 if (last_valid_range == 0 &&
1475 !cc->is_negated() &&
1476 ranges->at(0).IsEverything(max_char)) {
1477 // This is a common case hit by non-anchored expressions.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001478 if (check_offset) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001479 macro_assembler->CheckPosition(cp_offset, on_failure);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001480 }
1481 return;
1482 }
1483
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001484 if (!preloaded) {
1485 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001486 }
1487
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001488 for (int i = 0; i < last_valid_range; i++) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001489 CharacterRange& range = ranges->at(i);
1490 Label next_range;
1491 uc16 from = range.from();
1492 uc16 to = range.to();
ager@chromium.org8bb60582008-12-11 12:02:20 +00001493 if (from > max_char) {
1494 continue;
1495 }
1496 if (to > max_char) to = max_char;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001497 if (to == from) {
1498 macro_assembler->CheckCharacter(to, char_is_in_class);
1499 } else {
1500 if (from != 0) {
1501 macro_assembler->CheckCharacterLT(from, &next_range);
1502 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001503 if (to != max_char) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001504 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1505 } else {
1506 macro_assembler->GoTo(char_is_in_class);
1507 }
1508 }
1509 macro_assembler->Bind(&next_range);
1510 }
1511
ager@chromium.org8bb60582008-12-11 12:02:20 +00001512 CharacterRange& range = ranges->at(last_valid_range);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001513 uc16 from = range.from();
1514 uc16 to = range.to();
1515
ager@chromium.org8bb60582008-12-11 12:02:20 +00001516 if (to > max_char) to = max_char;
1517 ASSERT(to >= from);
1518
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001519 if (to == from) {
1520 if (cc->is_negated()) {
1521 macro_assembler->CheckCharacter(to, on_failure);
1522 } else {
1523 macro_assembler->CheckNotCharacter(to, on_failure);
1524 }
1525 } else {
1526 if (from != 0) {
1527 if (cc->is_negated()) {
1528 macro_assembler->CheckCharacterLT(from, &success);
1529 } else {
1530 macro_assembler->CheckCharacterLT(from, on_failure);
1531 }
1532 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001533 if (to != String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001534 if (cc->is_negated()) {
1535 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1536 } else {
1537 macro_assembler->CheckCharacterGT(to, on_failure);
1538 }
1539 } else {
1540 if (cc->is_negated()) {
1541 macro_assembler->GoTo(on_failure);
1542 }
1543 }
1544 }
1545 macro_assembler->Bind(&success);
1546}
1547
1548
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001549RegExpNode::~RegExpNode() {
1550}
1551
1552
ager@chromium.org8bb60582008-12-11 12:02:20 +00001553RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001554 Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001555 // If we are generating a greedy loop then don't stop and don't reuse code.
ager@chromium.org32912102009-01-16 10:38:43 +00001556 if (trace->stop_node() != NULL) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001557 return CONTINUE;
1558 }
1559
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001560 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00001561 if (trace->is_trivial()) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001562 if (label_.is_bound()) {
1563 // We are being asked to generate a generic version, but that's already
1564 // been done so just go to it.
1565 macro_assembler->GoTo(&label_);
1566 return DONE;
1567 }
1568 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1569 // To avoid too deep recursion we push the node to the work queue and just
1570 // generate a goto here.
1571 compiler->AddWork(this);
1572 macro_assembler->GoTo(&label_);
1573 return DONE;
1574 }
1575 // Generate generic version of the node and bind the label for later use.
1576 macro_assembler->Bind(&label_);
1577 return CONTINUE;
1578 }
1579
1580 // We are being asked to make a non-generic version. Keep track of how many
1581 // non-generic versions we generate so as not to overdo it.
ager@chromium.org32912102009-01-16 10:38:43 +00001582 trace_count_++;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001583 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00001584 trace_count_ < kMaxCopiesCodeGenerated &&
ager@chromium.org8bb60582008-12-11 12:02:20 +00001585 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1586 return CONTINUE;
1587 }
1588
ager@chromium.org32912102009-01-16 10:38:43 +00001589 // If we get here code has been generated for this node too many times or
1590 // recursion is too deep. Time to switch to a generic version. The code for
ager@chromium.org8bb60582008-12-11 12:02:20 +00001591 // generic versions above can handle deep recursion properly.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001592 trace->Flush(compiler, this);
1593 return DONE;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001594}
1595
1596
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001597int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001598 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1599 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001600 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001601}
1602
1603
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001604int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1605 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1606 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1607}
1608
1609
1610int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1611 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1612 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1613}
1614
1615
1616int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001617 int answer = Length();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001618 if (answer >= still_to_find) return answer;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001619 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001620 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1621 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001622}
1623
1624
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001625int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1626 int recursion_depth) {
1627 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1628 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1629 // afterwards.
1630 RegExpNode* node = alternatives_->at(1).node();
1631 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1632}
1633
1634
1635void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1636 QuickCheckDetails* details,
1637 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001638 int filled_in,
1639 bool not_at_start) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001640 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1641 // afterwards.
1642 RegExpNode* node = alternatives_->at(1).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00001643 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001644}
1645
1646
1647int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1648 int recursion_depth,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001649 RegExpNode* ignore_this_node) {
1650 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1651 int min = 100;
1652 int choice_count = alternatives_->length();
1653 for (int i = 0; i < choice_count; i++) {
1654 RegExpNode* node = alternatives_->at(i).node();
1655 if (node == ignore_this_node) continue;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001656 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1657 recursion_depth + 1);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001658 if (node_eats_at_least < min) min = node_eats_at_least;
1659 }
1660 return min;
1661}
1662
1663
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001664int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1665 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001666}
1667
1668
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001669int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1670 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001671}
1672
1673
1674// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1675static inline uint32_t SmearBitsRight(uint32_t v) {
1676 v |= v >> 1;
1677 v |= v >> 2;
1678 v |= v >> 4;
1679 v |= v >> 8;
1680 v |= v >> 16;
1681 return v;
1682}
1683
1684
1685bool QuickCheckDetails::Rationalize(bool asc) {
1686 bool found_useful_op = false;
1687 uint32_t char_mask;
1688 if (asc) {
1689 char_mask = String::kMaxAsciiCharCode;
1690 } else {
1691 char_mask = String::kMaxUC16CharCode;
1692 }
1693 mask_ = 0;
1694 value_ = 0;
1695 int char_shift = 0;
1696 for (int i = 0; i < characters_; i++) {
1697 Position* pos = &positions_[i];
1698 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1699 found_useful_op = true;
1700 }
1701 mask_ |= (pos->mask & char_mask) << char_shift;
1702 value_ |= (pos->value & char_mask) << char_shift;
1703 char_shift += asc ? 8 : 16;
1704 }
1705 return found_useful_op;
1706}
1707
1708
1709bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00001710 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001711 bool preload_has_checked_bounds,
1712 Label* on_possible_success,
1713 QuickCheckDetails* details,
1714 bool fall_through_on_failure) {
1715 if (details->characters() == 0) return false;
iposva@chromium.org245aa852009-02-10 00:49:54 +00001716 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1717 if (details->cannot_match()) return false;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001718 if (!details->Rationalize(compiler->ascii())) return false;
1719 uint32_t mask = details->mask();
1720 uint32_t value = details->value();
1721
1722 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1723
ager@chromium.org32912102009-01-16 10:38:43 +00001724 if (trace->characters_preloaded() != details->characters()) {
1725 assembler->LoadCurrentCharacter(trace->cp_offset(),
1726 trace->backtrack(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001727 !preload_has_checked_bounds,
1728 details->characters());
1729 }
1730
1731
1732 bool need_mask = true;
1733
1734 if (details->characters() == 1) {
1735 // If number of characters preloaded is 1 then we used a byte or 16 bit
1736 // load so the value is already masked down.
1737 uint32_t char_mask;
1738 if (compiler->ascii()) {
1739 char_mask = String::kMaxAsciiCharCode;
1740 } else {
1741 char_mask = String::kMaxUC16CharCode;
1742 }
1743 if ((mask & char_mask) == char_mask) need_mask = false;
1744 mask &= char_mask;
1745 } else {
1746 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1747 // zero extend.
1748 if (details->characters() == 2 && compiler->ascii()) {
1749 if ((mask & 0xffff) == 0xffff) need_mask = false;
1750 } else {
1751 if (mask == 0xffffffff) need_mask = false;
1752 }
1753 }
1754
1755 if (fall_through_on_failure) {
1756 if (need_mask) {
1757 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1758 } else {
1759 assembler->CheckCharacter(value, on_possible_success);
1760 }
1761 } else {
1762 if (need_mask) {
ager@chromium.org32912102009-01-16 10:38:43 +00001763 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001764 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00001765 assembler->CheckNotCharacter(value, trace->backtrack());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001766 }
1767 }
1768 return true;
1769}
1770
1771
1772// Here is the meat of GetQuickCheckDetails (see also the comment on the
1773// super-class in the .h file).
1774//
1775// We iterate along the text object, building up for each character a
1776// mask and value that can be used to test for a quick failure to match.
1777// The masks and values for the positions will be combined into a single
1778// machine word for the current character width in order to be used in
1779// generating a quick check.
1780void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1781 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00001782 int characters_filled_in,
1783 bool not_at_start) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001784 ASSERT(characters_filled_in < details->characters());
1785 int characters = details->characters();
1786 int char_mask;
1787 int char_shift;
1788 if (compiler->ascii()) {
1789 char_mask = String::kMaxAsciiCharCode;
1790 char_shift = 8;
1791 } else {
1792 char_mask = String::kMaxUC16CharCode;
1793 char_shift = 16;
1794 }
1795 for (int k = 0; k < elms_->length(); k++) {
1796 TextElement elm = elms_->at(k);
1797 if (elm.type == TextElement::ATOM) {
1798 Vector<const uc16> quarks = elm.data.u_atom->data();
1799 for (int i = 0; i < characters && i < quarks.length(); i++) {
1800 QuickCheckDetails::Position* pos =
1801 details->positions(characters_filled_in);
ager@chromium.org6f10e412009-02-13 10:11:16 +00001802 uc16 c = quarks[i];
1803 if (c > char_mask) {
1804 // If we expect a non-ASCII character from an ASCII string,
1805 // there is no way we can match. Not even case independent
1806 // matching can turn an ASCII character into non-ASCII or
1807 // vice versa.
1808 details->set_cannot_match();
ager@chromium.org381abbb2009-02-25 13:23:22 +00001809 pos->determines_perfectly = false;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001810 return;
1811 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001812 if (compiler->ignore_case()) {
1813 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
ager@chromium.org381abbb2009-02-25 13:23:22 +00001814 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1815 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1816 if (length == 1) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001817 // This letter has no case equivalents, so it's nice and simple
1818 // and the mask-compare will determine definitely whether we have
1819 // a match at this character position.
1820 pos->mask = char_mask;
1821 pos->value = c;
1822 pos->determines_perfectly = true;
1823 } else {
1824 uint32_t common_bits = char_mask;
1825 uint32_t bits = chars[0];
1826 for (int j = 1; j < length; j++) {
1827 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1828 common_bits ^= differing_bits;
1829 bits &= common_bits;
1830 }
1831 // If length is 2 and common bits has only one zero in it then
1832 // our mask and compare instruction will determine definitely
1833 // whether we have a match at this character position. Otherwise
1834 // it can only be an approximate check.
1835 uint32_t one_zero = (common_bits | ~char_mask);
1836 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1837 pos->determines_perfectly = true;
1838 }
1839 pos->mask = common_bits;
1840 pos->value = bits;
1841 }
1842 } else {
1843 // Don't ignore case. Nice simple case where the mask-compare will
1844 // determine definitely whether we have a match at this character
1845 // position.
1846 pos->mask = char_mask;
ager@chromium.org6f10e412009-02-13 10:11:16 +00001847 pos->value = c;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001848 pos->determines_perfectly = true;
1849 }
1850 characters_filled_in++;
1851 ASSERT(characters_filled_in <= details->characters());
1852 if (characters_filled_in == details->characters()) {
1853 return;
1854 }
1855 }
1856 } else {
1857 QuickCheckDetails::Position* pos =
1858 details->positions(characters_filled_in);
1859 RegExpCharacterClass* tree = elm.data.u_char_class;
1860 ZoneList<CharacterRange>* ranges = tree->ranges();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001861 if (tree->is_negated()) {
1862 // A quick check uses multi-character mask and compare. There is no
1863 // useful way to incorporate a negative char class into this scheme
1864 // so we just conservatively create a mask and value that will always
1865 // succeed.
1866 pos->mask = 0;
1867 pos->value = 0;
1868 } else {
ager@chromium.org381abbb2009-02-25 13:23:22 +00001869 int first_range = 0;
1870 while (ranges->at(first_range).from() > char_mask) {
1871 first_range++;
1872 if (first_range == ranges->length()) {
1873 details->set_cannot_match();
1874 pos->determines_perfectly = false;
1875 return;
1876 }
1877 }
1878 CharacterRange range = ranges->at(first_range);
1879 uc16 from = range.from();
1880 uc16 to = range.to();
1881 if (to > char_mask) {
1882 to = char_mask;
1883 }
1884 uint32_t differing_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001885 // A mask and compare is only perfect if the differing bits form a
1886 // number like 00011111 with one single block of trailing 1s.
1887 if ((differing_bits & (differing_bits + 1)) == 0) {
1888 pos->determines_perfectly = true;
1889 }
1890 uint32_t common_bits = ~SmearBitsRight(differing_bits);
ager@chromium.org381abbb2009-02-25 13:23:22 +00001891 uint32_t bits = (from & common_bits);
1892 for (int i = first_range + 1; i < ranges->length(); i++) {
1893 CharacterRange range = ranges->at(i);
1894 uc16 from = range.from();
1895 uc16 to = range.to();
1896 if (from > char_mask) continue;
1897 if (to > char_mask) to = char_mask;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001898 // Here we are combining more ranges into the mask and compare
1899 // value. With each new range the mask becomes more sparse and
1900 // so the chances of a false positive rise. A character class
1901 // with multiple ranges is assumed never to be equivalent to a
1902 // mask and compare operation.
1903 pos->determines_perfectly = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001904 uint32_t new_common_bits = (from ^ to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001905 new_common_bits = ~SmearBitsRight(new_common_bits);
1906 common_bits &= new_common_bits;
1907 bits &= new_common_bits;
ager@chromium.org381abbb2009-02-25 13:23:22 +00001908 uint32_t differing_bits = (from & common_bits) ^ bits;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001909 common_bits ^= differing_bits;
1910 bits &= common_bits;
1911 }
1912 pos->mask = common_bits;
1913 pos->value = bits;
1914 }
1915 characters_filled_in++;
1916 ASSERT(characters_filled_in <= details->characters());
1917 if (characters_filled_in == details->characters()) {
1918 return;
1919 }
1920 }
1921 }
1922 ASSERT(characters_filled_in != details->characters());
iposva@chromium.org245aa852009-02-10 00:49:54 +00001923 on_success()-> GetQuickCheckDetails(details,
1924 compiler,
1925 characters_filled_in,
1926 true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001927}
1928
1929
1930void QuickCheckDetails::Clear() {
1931 for (int i = 0; i < characters_; i++) {
1932 positions_[i].mask = 0;
1933 positions_[i].value = 0;
1934 positions_[i].determines_perfectly = false;
1935 }
1936 characters_ = 0;
1937}
1938
1939
1940void QuickCheckDetails::Advance(int by, bool ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00001941 ASSERT(by >= 0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001942 if (by >= characters_) {
1943 Clear();
1944 return;
1945 }
1946 for (int i = 0; i < characters_ - by; i++) {
1947 positions_[i] = positions_[by + i];
1948 }
1949 for (int i = characters_ - by; i < characters_; i++) {
1950 positions_[i].mask = 0;
1951 positions_[i].value = 0;
1952 positions_[i].determines_perfectly = false;
1953 }
1954 characters_ -= by;
1955 // We could change mask_ and value_ here but we would never advance unless
1956 // they had already been used in a check and they won't be used again because
1957 // it would gain us nothing. So there's no point.
1958}
1959
1960
1961void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1962 ASSERT(characters_ == other->characters_);
iposva@chromium.org245aa852009-02-10 00:49:54 +00001963 if (other->cannot_match_) {
1964 return;
1965 }
1966 if (cannot_match_) {
1967 *this = *other;
1968 return;
1969 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001970 for (int i = from_index; i < characters_; i++) {
1971 QuickCheckDetails::Position* pos = positions(i);
1972 QuickCheckDetails::Position* other_pos = other->positions(i);
1973 if (pos->mask != other_pos->mask ||
1974 pos->value != other_pos->value ||
1975 !other_pos->determines_perfectly) {
1976 // Our mask-compare operation will be approximate unless we have the
1977 // exact same operation on both sides of the alternation.
1978 pos->determines_perfectly = false;
1979 }
1980 pos->mask &= other_pos->mask;
1981 pos->value &= pos->mask;
1982 other_pos->value &= pos->mask;
1983 uc16 differing_bits = (pos->value ^ other_pos->value);
1984 pos->mask &= ~differing_bits;
1985 pos->value &= pos->mask;
1986 }
1987}
1988
1989
ager@chromium.org32912102009-01-16 10:38:43 +00001990class VisitMarker {
1991 public:
1992 explicit VisitMarker(NodeInfo* info) : info_(info) {
1993 ASSERT(!info->visited);
1994 info->visited = true;
1995 }
1996 ~VisitMarker() {
1997 info_->visited = false;
1998 }
1999 private:
2000 NodeInfo* info_;
2001};
2002
2003
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002004void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2005 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002006 int characters_filled_in,
2007 bool not_at_start) {
ager@chromium.org32912102009-01-16 10:38:43 +00002008 if (body_can_be_zero_length_ || info()->visited) return;
2009 VisitMarker marker(info());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002010 return ChoiceNode::GetQuickCheckDetails(details,
2011 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002012 characters_filled_in,
2013 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002014}
2015
2016
2017void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2018 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002019 int characters_filled_in,
2020 bool not_at_start) {
2021 not_at_start = (not_at_start || not_at_start_);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002022 int choice_count = alternatives_->length();
2023 ASSERT(choice_count > 0);
2024 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2025 compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00002026 characters_filled_in,
2027 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002028 for (int i = 1; i < choice_count; i++) {
2029 QuickCheckDetails new_details(details->characters());
2030 RegExpNode* node = alternatives_->at(i).node();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002031 node->GetQuickCheckDetails(&new_details, compiler,
2032 characters_filled_in,
2033 not_at_start);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002034 // Here we merge the quick match details of the two branches.
2035 details->Merge(&new_details, characters_filled_in);
2036 }
2037}
2038
2039
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002040// Check for [0-9A-Z_a-z].
2041static void EmitWordCheck(RegExpMacroAssembler* assembler,
2042 Label* word,
2043 Label* non_word,
2044 bool fall_through_on_word) {
2045 assembler->CheckCharacterGT('z', non_word);
2046 assembler->CheckCharacterLT('0', non_word);
2047 assembler->CheckCharacterGT('a' - 1, word);
2048 assembler->CheckCharacterLT('9' + 1, word);
2049 assembler->CheckCharacterLT('A', non_word);
2050 assembler->CheckCharacterLT('Z' + 1, word);
2051 if (fall_through_on_word) {
2052 assembler->CheckNotCharacter('_', non_word);
2053 } else {
2054 assembler->CheckCharacter('_', word);
2055 }
2056}
2057
2058
2059// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2060// that matches newline or the start of input).
2061static void EmitHat(RegExpCompiler* compiler,
2062 RegExpNode* on_success,
2063 Trace* trace) {
2064 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2065 // We will be loading the previous character into the current character
2066 // register.
2067 Trace new_trace(*trace);
2068 new_trace.InvalidateCurrentCharacter();
2069
2070 Label ok;
2071 if (new_trace.cp_offset() == 0) {
2072 // The start of input counts as a newline in this context, so skip to
2073 // ok if we are at the start.
2074 assembler->CheckAtStart(&ok);
2075 }
2076 // We already checked that we are not at the start of input so it must be
2077 // OK to load the previous character.
2078 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2079 new_trace.backtrack(),
2080 false);
2081 // Newline means \n, \r, 0x2028 or 0x2029.
2082 if (!compiler->ascii()) {
2083 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2084 }
2085 assembler->CheckCharacter('\n', &ok);
2086 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2087 assembler->Bind(&ok);
2088 on_success->Emit(compiler, &new_trace);
2089}
2090
2091
2092// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2093static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2094 RegExpCompiler* compiler,
2095 RegExpNode* on_success,
2096 Trace* trace) {
2097 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2098 Label before_non_word;
2099 Label before_word;
2100 if (trace->characters_preloaded() != 1) {
2101 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2102 }
2103 // Fall through on non-word.
2104 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2105
2106 // We will be loading the previous character into the current character
2107 // register.
2108 Trace new_trace(*trace);
2109 new_trace.InvalidateCurrentCharacter();
2110
2111 Label ok;
2112 Label* boundary;
2113 Label* not_boundary;
2114 if (type == AssertionNode::AT_BOUNDARY) {
2115 boundary = &ok;
2116 not_boundary = new_trace.backtrack();
2117 } else {
2118 not_boundary = &ok;
2119 boundary = new_trace.backtrack();
2120 }
2121
2122 // Next character is not a word character.
2123 assembler->Bind(&before_non_word);
2124 if (new_trace.cp_offset() == 0) {
2125 // The start of input counts as a non-word character, so the question is
2126 // decided if we are at the start.
2127 assembler->CheckAtStart(not_boundary);
2128 }
2129 // We already checked that we are not at the start of input so it must be
2130 // OK to load the previous character.
2131 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2132 &ok, // Unused dummy label in this call.
2133 false);
2134 // Fall through on non-word.
2135 EmitWordCheck(assembler, boundary, not_boundary, false);
2136 assembler->GoTo(not_boundary);
2137
2138 // Next character is a word character.
2139 assembler->Bind(&before_word);
2140 if (new_trace.cp_offset() == 0) {
2141 // The start of input counts as a non-word character, so the question is
2142 // decided if we are at the start.
2143 assembler->CheckAtStart(boundary);
2144 }
2145 // We already checked that we are not at the start of input so it must be
2146 // OK to load the previous character.
2147 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2148 &ok, // Unused dummy label in this call.
2149 false);
2150 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2151 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2152
2153 assembler->Bind(&ok);
2154
2155 on_success->Emit(compiler, &new_trace);
2156}
2157
2158
iposva@chromium.org245aa852009-02-10 00:49:54 +00002159void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2160 RegExpCompiler* compiler,
2161 int filled_in,
2162 bool not_at_start) {
2163 if (type_ == AT_START && not_at_start) {
2164 details->set_cannot_match();
2165 return;
2166 }
2167 return on_success()->GetQuickCheckDetails(details,
2168 compiler,
2169 filled_in,
2170 not_at_start);
2171}
2172
2173
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002174void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2175 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2176 switch (type_) {
2177 case AT_END: {
2178 Label ok;
2179 assembler->CheckPosition(trace->cp_offset(), &ok);
2180 assembler->GoTo(trace->backtrack());
2181 assembler->Bind(&ok);
2182 break;
2183 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002184 case AT_START: {
2185 if (trace->at_start() == Trace::FALSE) {
2186 assembler->GoTo(trace->backtrack());
2187 return;
2188 }
2189 if (trace->at_start() == Trace::UNKNOWN) {
2190 assembler->CheckNotAtStart(trace->backtrack());
2191 Trace at_start_trace = *trace;
2192 at_start_trace.set_at_start(true);
2193 on_success()->Emit(compiler, &at_start_trace);
2194 return;
2195 }
2196 }
2197 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002198 case AFTER_NEWLINE:
2199 EmitHat(compiler, on_success(), trace);
2200 return;
2201 case AT_NON_BOUNDARY:
2202 case AT_BOUNDARY:
2203 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2204 return;
2205 }
2206 on_success()->Emit(compiler, trace);
2207}
2208
2209
ager@chromium.org381abbb2009-02-25 13:23:22 +00002210static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2211 if (quick_check == NULL) return false;
2212 if (offset >= quick_check->characters()) return false;
2213 return quick_check->positions(offset)->determines_perfectly;
2214}
2215
2216
2217static void UpdateBoundsCheck(int index, int* checked_up_to) {
2218 if (index > *checked_up_to) {
2219 *checked_up_to = index;
2220 }
2221}
2222
2223
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002224// We call this repeatedly to generate code for each pass over the text node.
2225// The passes are in increasing order of difficulty because we hope one
2226// of the first passes will fail in which case we are saved the work of the
2227// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2228// we will check the '%' in the first pass, the case independent 'a' in the
2229// second pass and the character class in the last pass.
2230//
2231// The passes are done from right to left, so for example to test for /bar/
2232// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2233// and then a 'b' with offset 0. This means we can avoid the end-of-input
2234// bounds check most of the time. In the example we only need to check for
2235// end-of-input when loading the putative 'r'.
2236//
2237// A slight complication involves the fact that the first character may already
2238// be fetched into a register by the previous node. In this case we want to
2239// do the test for that character first. We do this in separate passes. The
2240// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2241// pass has been performed then subsequent passes will have true in
2242// first_element_checked to indicate that that character does not need to be
2243// checked again.
2244//
ager@chromium.org32912102009-01-16 10:38:43 +00002245// In addition to all this we are passed a Trace, which can
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002246// contain an AlternativeGeneration object. In this AlternativeGeneration
2247// object we can see details of any quick check that was already passed in
2248// order to get to the code we are now generating. The quick check can involve
2249// loading characters, which means we do not need to recheck the bounds
2250// up to the limit the quick check already checked. In addition the quick
2251// check can have involved a mask and compare operation which may simplify
2252// or obviate the need for further checks at some character positions.
2253void TextNode::TextEmitPass(RegExpCompiler* compiler,
2254 TextEmitPassType pass,
2255 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +00002256 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002257 bool first_element_checked,
2258 int* checked_up_to) {
2259 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2260 bool ascii = compiler->ascii();
ager@chromium.org32912102009-01-16 10:38:43 +00002261 Label* backtrack = trace->backtrack();
2262 QuickCheckDetails* quick_check = trace->quick_check_performed();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002263 int element_count = elms_->length();
2264 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2265 TextElement elm = elms_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002266 int cp_offset = trace->cp_offset() + elm.cp_offset;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002267 if (elm.type == TextElement::ATOM) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002268 Vector<const uc16> quarks = elm.data.u_atom->data();
2269 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2270 if (first_element_checked && i == 0 && j == 0) continue;
2271 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2272 EmitCharacterFunction* emit_function = NULL;
2273 switch (pass) {
2274 case NON_ASCII_MATCH:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002275 ASSERT(ascii);
2276 if (quarks[j] > String::kMaxAsciiCharCode) {
2277 assembler->GoTo(backtrack);
2278 return;
2279 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002280 break;
2281 case NON_LETTER_CHARACTER_MATCH:
2282 emit_function = &EmitAtomNonLetter;
2283 break;
2284 case SIMPLE_CHARACTER_MATCH:
2285 emit_function = &EmitSimpleCharacter;
2286 break;
2287 case CASE_CHARACTER_MATCH:
2288 emit_function = &EmitAtomLetter;
2289 break;
2290 default:
2291 break;
2292 }
2293 if (emit_function != NULL) {
2294 bool bound_checked = emit_function(compiler,
ager@chromium.org6f10e412009-02-13 10:11:16 +00002295 quarks[j],
2296 backtrack,
2297 cp_offset + j,
2298 *checked_up_to < cp_offset + j,
2299 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002300 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002301 }
2302 }
2303 } else {
2304 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002305 if (pass == CHARACTER_CLASS_MATCH) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002306 if (first_element_checked && i == 0) continue;
2307 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002308 RegExpCharacterClass* cc = elm.data.u_char_class;
2309 EmitCharClass(assembler,
2310 cc,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002311 ascii,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002312 backtrack,
2313 cp_offset,
2314 *checked_up_to < cp_offset,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002315 preloaded);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002316 UpdateBoundsCheck(cp_offset, checked_up_to);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002317 }
2318 }
2319 }
2320}
2321
2322
2323int TextNode::Length() {
2324 TextElement elm = elms_->last();
2325 ASSERT(elm.cp_offset >= 0);
2326 if (elm.type == TextElement::ATOM) {
2327 return elm.cp_offset + elm.data.u_atom->data().length();
2328 } else {
2329 return elm.cp_offset + 1;
2330 }
2331}
2332
2333
ager@chromium.org381abbb2009-02-25 13:23:22 +00002334bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2335 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2336 if (ignore_case) {
2337 return pass == SIMPLE_CHARACTER_MATCH;
2338 } else {
2339 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2340 }
2341}
2342
2343
ager@chromium.org8bb60582008-12-11 12:02:20 +00002344// This generates the code to match a text node. A text node can contain
2345// straight character sequences (possibly to be matched in a case-independent
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002346// way) and character classes. For efficiency we do not do this in a single
2347// pass from left to right. Instead we pass over the text node several times,
2348// emitting code for some character positions every time. See the comment on
2349// TextEmitPass for details.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002350void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org32912102009-01-16 10:38:43 +00002351 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002352 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002353 ASSERT(limit_result == CONTINUE);
2354
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002355 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2356 compiler->SetRegExpTooBig();
2357 return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002358 }
2359
2360 if (compiler->ascii()) {
2361 int dummy = 0;
ager@chromium.org32912102009-01-16 10:38:43 +00002362 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002363 }
2364
2365 bool first_elt_done = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002366 int bound_checked_to = trace->cp_offset() - 1;
2367 bound_checked_to += trace->bound_checked_up_to();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002368
2369 // If a character is preloaded into the current character register then
2370 // check that now.
ager@chromium.org32912102009-01-16 10:38:43 +00002371 if (trace->characters_preloaded() == 1) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002372 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2373 if (!SkipPass(pass, compiler->ignore_case())) {
2374 TextEmitPass(compiler,
2375 static_cast<TextEmitPassType>(pass),
2376 true,
2377 trace,
2378 false,
2379 &bound_checked_to);
2380 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002381 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002382 first_elt_done = true;
2383 }
2384
ager@chromium.org381abbb2009-02-25 13:23:22 +00002385 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2386 if (!SkipPass(pass, compiler->ignore_case())) {
2387 TextEmitPass(compiler,
2388 static_cast<TextEmitPassType>(pass),
2389 false,
2390 trace,
2391 first_elt_done,
2392 &bound_checked_to);
2393 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002394 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002395
ager@chromium.org32912102009-01-16 10:38:43 +00002396 Trace successor_trace(*trace);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002397 successor_trace.set_at_start(false);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002398 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002399 RecursionCheck rc(compiler);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002400 on_success()->Emit(compiler, &successor_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002401}
2402
2403
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002404void Trace::InvalidateCurrentCharacter() {
2405 characters_preloaded_ = 0;
2406}
2407
2408
2409void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002410 ASSERT(by > 0);
2411 // We don't have an instruction for shifting the current character register
2412 // down or for using a shifted value for anything so lets just forget that
2413 // we preloaded any characters into it.
2414 characters_preloaded_ = 0;
2415 // Adjust the offsets of the quick check performed information. This
2416 // information is used to find out what we already determined about the
2417 // characters by means of mask and compare.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002418 quick_check_performed_.Advance(by, compiler->ascii());
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002419 cp_offset_ += by;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002420 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2421 compiler->SetRegExpTooBig();
2422 cp_offset_ = 0;
2423 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002424 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002425}
2426
2427
2428void TextNode::MakeCaseIndependent() {
2429 int element_count = elms_->length();
2430 for (int i = 0; i < element_count; i++) {
2431 TextElement elm = elms_->at(i);
2432 if (elm.type == TextElement::CHAR_CLASS) {
2433 RegExpCharacterClass* cc = elm.data.u_char_class;
2434 ZoneList<CharacterRange>* ranges = cc->ranges();
2435 int range_count = ranges->length();
2436 for (int i = 0; i < range_count; i++) {
2437 ranges->at(i).AddCaseEquivalents(ranges);
2438 }
2439 }
2440 }
2441}
2442
2443
ager@chromium.org8bb60582008-12-11 12:02:20 +00002444int TextNode::GreedyLoopTextLength() {
2445 TextElement elm = elms_->at(elms_->length() - 1);
2446 if (elm.type == TextElement::CHAR_CLASS) {
2447 return elm.cp_offset + 1;
2448 } else {
2449 return elm.cp_offset + elm.data.u_atom->data().length();
2450 }
2451}
2452
2453
2454// Finds the fixed match length of a sequence of nodes that goes from
2455// this alternative and back to this choice node. If there are variable
2456// length nodes or other complications in the way then return a sentinel
2457// value indicating that a greedy loop cannot be constructed.
2458int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2459 int length = 0;
2460 RegExpNode* node = alternative->node();
2461 // Later we will generate code for all these text nodes using recursion
2462 // so we have to limit the max number.
2463 int recursion_depth = 0;
2464 while (node != this) {
2465 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2466 return kNodeIsTooComplexForGreedyLoops;
2467 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002468 int node_length = node->GreedyLoopTextLength();
2469 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2470 return kNodeIsTooComplexForGreedyLoops;
2471 }
2472 length += node_length;
2473 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2474 node = seq_node->on_success();
2475 }
2476 return length;
2477}
2478
2479
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002480void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2481 ASSERT_EQ(loop_node_, NULL);
2482 AddAlternative(alt);
2483 loop_node_ = alt.node();
2484}
2485
2486
2487void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2488 ASSERT_EQ(continue_node_, NULL);
2489 AddAlternative(alt);
2490 continue_node_ = alt.node();
2491}
2492
2493
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002494void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002495 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002496 if (trace->stop_node() == this) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002497 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2498 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2499 // Update the counter-based backtracking info on the stack. This is an
2500 // optimization for greedy loops (see below).
ager@chromium.org32912102009-01-16 10:38:43 +00002501 ASSERT(trace->cp_offset() == text_length);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002502 macro_assembler->AdvanceCurrentPosition(text_length);
ager@chromium.org32912102009-01-16 10:38:43 +00002503 macro_assembler->GoTo(trace->loop_label());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002504 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002505 }
ager@chromium.org32912102009-01-16 10:38:43 +00002506 ASSERT(trace->stop_node() == NULL);
2507 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002508 trace->Flush(compiler, this);
2509 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002510 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002511 ChoiceNode::Emit(compiler, trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002512}
2513
2514
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002515int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002516 int preload_characters = EatsAtLeast(4, 0);
ager@chromium.org9085a012009-05-11 19:22:57 +00002517#ifdef V8_HOST_CAN_READ_UNALIGNED
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002518 bool ascii = compiler->ascii();
2519 if (ascii) {
2520 if (preload_characters > 4) preload_characters = 4;
2521 // We can't preload 3 characters because there is no machine instruction
2522 // to do that. We can't just load 4 because we could be reading
2523 // beyond the end of the string, which could cause a memory fault.
2524 if (preload_characters == 3) preload_characters = 2;
2525 } else {
2526 if (preload_characters > 2) preload_characters = 2;
2527 }
2528#else
2529 if (preload_characters > 1) preload_characters = 1;
2530#endif
2531 return preload_characters;
2532}
2533
2534
2535// This class is used when generating the alternatives in a choice node. It
2536// records the way the alternative is being code generated.
2537class AlternativeGeneration: public Malloced {
2538 public:
2539 AlternativeGeneration()
2540 : possible_success(),
2541 expects_preload(false),
2542 after(),
2543 quick_check_details() { }
2544 Label possible_success;
2545 bool expects_preload;
2546 Label after;
2547 QuickCheckDetails quick_check_details;
2548};
2549
2550
2551// Creates a list of AlternativeGenerations. If the list has a reasonable
2552// size then it is on the stack, otherwise the excess is on the heap.
2553class AlternativeGenerationList {
2554 public:
2555 explicit AlternativeGenerationList(int count)
2556 : alt_gens_(count) {
2557 for (int i = 0; i < count && i < kAFew; i++) {
2558 alt_gens_.Add(a_few_alt_gens_ + i);
2559 }
2560 for (int i = kAFew; i < count; i++) {
2561 alt_gens_.Add(new AlternativeGeneration());
2562 }
2563 }
2564 ~AlternativeGenerationList() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002565 for (int i = kAFew; i < alt_gens_.length(); i++) {
2566 delete alt_gens_[i];
2567 alt_gens_[i] = NULL;
2568 }
2569 }
2570
2571 AlternativeGeneration* at(int i) {
2572 return alt_gens_[i];
2573 }
2574 private:
2575 static const int kAFew = 10;
2576 ZoneList<AlternativeGeneration*> alt_gens_;
2577 AlternativeGeneration a_few_alt_gens_[kAFew];
2578};
2579
2580
2581/* Code generation for choice nodes.
2582 *
2583 * We generate quick checks that do a mask and compare to eliminate a
2584 * choice. If the quick check succeeds then it jumps to the continuation to
2585 * do slow checks and check subsequent nodes. If it fails (the common case)
2586 * it falls through to the next choice.
2587 *
2588 * Here is the desired flow graph. Nodes directly below each other imply
2589 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2590 * 3 doesn't have a quick check so we have to call the slow check.
2591 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2592 * regexp continuation is generated directly after the Sn node, up to the
2593 * next GoTo if we decide to reuse some already generated code. Some
2594 * nodes expect preload_characters to be preloaded into the current
2595 * character register. R nodes do this preloading. Vertices are marked
2596 * F for failures and S for success (possible success in the case of quick
2597 * nodes). L, V, < and > are used as arrow heads.
2598 *
2599 * ----------> R
2600 * |
2601 * V
2602 * Q1 -----> S1
2603 * | S /
2604 * F| /
2605 * | F/
2606 * | /
2607 * | R
2608 * | /
2609 * V L
2610 * Q2 -----> S2
2611 * | S /
2612 * F| /
2613 * | F/
2614 * | /
2615 * | R
2616 * | /
2617 * V L
2618 * S3
2619 * |
2620 * F|
2621 * |
2622 * R
2623 * |
2624 * backtrack V
2625 * <----------Q4
2626 * \ F |
2627 * \ |S
2628 * \ F V
2629 * \-----S4
2630 *
2631 * For greedy loops we reverse our expectation and expect to match rather
2632 * than fail. Therefore we want the loop code to look like this (U is the
2633 * unwind code that steps back in the greedy loop). The following alternatives
2634 * look the same as above.
2635 * _____
2636 * / \
2637 * V |
2638 * ----------> S1 |
2639 * /| |
2640 * / |S |
2641 * F/ \_____/
2642 * /
2643 * |<-----------
2644 * | \
2645 * V \
2646 * Q2 ---> S2 \
2647 * | S / |
2648 * F| / |
2649 * | F/ |
2650 * | / |
2651 * | R |
2652 * | / |
2653 * F VL |
2654 * <------U |
2655 * back |S |
2656 * \______________/
2657 */
2658
2659
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002660void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002661 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2662 int choice_count = alternatives_->length();
2663#ifdef DEBUG
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002664 for (int i = 0; i < choice_count - 1; i++) {
2665 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002666 ZoneList<Guard*>* guards = alternative.guards();
ager@chromium.org8bb60582008-12-11 12:02:20 +00002667 int guard_count = (guards == NULL) ? 0 : guards->length();
2668 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002669 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002670 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002671 }
2672#endif
2673
ager@chromium.org32912102009-01-16 10:38:43 +00002674 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002675 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002676 ASSERT(limit_result == CONTINUE);
2677
ager@chromium.org381abbb2009-02-25 13:23:22 +00002678 int new_flush_budget = trace->flush_budget() / choice_count;
2679 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2680 trace->Flush(compiler, this);
2681 return;
2682 }
2683
ager@chromium.org8bb60582008-12-11 12:02:20 +00002684 RecursionCheck rc(compiler);
2685
ager@chromium.org32912102009-01-16 10:38:43 +00002686 Trace* current_trace = trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002687
2688 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2689 bool greedy_loop = false;
2690 Label greedy_loop_label;
ager@chromium.org32912102009-01-16 10:38:43 +00002691 Trace counter_backtrack_trace;
2692 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002693 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2694
ager@chromium.org8bb60582008-12-11 12:02:20 +00002695 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2696 // Here we have special handling for greedy loops containing only text nodes
2697 // and other simple nodes. These are handled by pushing the current
2698 // position on the stack and then incrementing the current position each
2699 // time around the switch. On backtrack we decrement the current position
2700 // and check it against the pushed value. This avoids pushing backtrack
2701 // information for each iteration of the loop, which could take up a lot of
2702 // space.
2703 greedy_loop = true;
ager@chromium.org32912102009-01-16 10:38:43 +00002704 ASSERT(trace->stop_node() == NULL);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002705 macro_assembler->PushCurrentPosition();
ager@chromium.org32912102009-01-16 10:38:43 +00002706 current_trace = &counter_backtrack_trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002707 Label greedy_match_failed;
ager@chromium.org32912102009-01-16 10:38:43 +00002708 Trace greedy_match_trace;
iposva@chromium.org245aa852009-02-10 00:49:54 +00002709 if (not_at_start()) greedy_match_trace.set_at_start(false);
ager@chromium.org32912102009-01-16 10:38:43 +00002710 greedy_match_trace.set_backtrack(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002711 Label loop_label;
2712 macro_assembler->Bind(&loop_label);
ager@chromium.org32912102009-01-16 10:38:43 +00002713 greedy_match_trace.set_stop_node(this);
2714 greedy_match_trace.set_loop_label(&loop_label);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002715 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002716 macro_assembler->Bind(&greedy_match_failed);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002717 }
2718
2719 Label second_choice; // For use in greedy matches.
2720 macro_assembler->Bind(&second_choice);
2721
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002722 int first_normal_choice = greedy_loop ? 1 : 0;
2723
2724 int preload_characters = CalculatePreloadCharacters(compiler);
2725 bool preload_is_current =
ager@chromium.org32912102009-01-16 10:38:43 +00002726 (current_trace->characters_preloaded() == preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002727 bool preload_has_checked_bounds = preload_is_current;
2728
2729 AlternativeGenerationList alt_gens(choice_count);
2730
ager@chromium.org8bb60582008-12-11 12:02:20 +00002731 // For now we just call all choices one after the other. The idea ultimately
2732 // is to use the Dispatch table to try only the relevant ones.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002733 for (int i = first_normal_choice; i < choice_count; i++) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00002734 GuardedAlternative alternative = alternatives_->at(i);
ager@chromium.org32912102009-01-16 10:38:43 +00002735 AlternativeGeneration* alt_gen = alt_gens.at(i);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002736 alt_gen->quick_check_details.set_characters(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002737 ZoneList<Guard*>* guards = alternative.guards();
2738 int guard_count = (guards == NULL) ? 0 : guards->length();
ager@chromium.org32912102009-01-16 10:38:43 +00002739 Trace new_trace(*current_trace);
2740 new_trace.set_characters_preloaded(preload_is_current ?
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002741 preload_characters :
2742 0);
2743 if (preload_has_checked_bounds) {
ager@chromium.org32912102009-01-16 10:38:43 +00002744 new_trace.set_bound_checked_up_to(preload_characters);
ager@chromium.org8bb60582008-12-11 12:02:20 +00002745 }
ager@chromium.org32912102009-01-16 10:38:43 +00002746 new_trace.quick_check_performed()->Clear();
iposva@chromium.org245aa852009-02-10 00:49:54 +00002747 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002748 alt_gen->expects_preload = preload_is_current;
2749 bool generate_full_check_inline = false;
ager@chromium.org381abbb2009-02-25 13:23:22 +00002750 if (FLAG_regexp_optimization &&
iposva@chromium.org245aa852009-02-10 00:49:54 +00002751 try_to_emit_quick_check_for_alternative(i) &&
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002752 alternative.node()->EmitQuickCheck(compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002753 &new_trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002754 preload_has_checked_bounds,
2755 &alt_gen->possible_success,
2756 &alt_gen->quick_check_details,
2757 i < choice_count - 1)) {
2758 // Quick check was generated for this choice.
2759 preload_is_current = true;
2760 preload_has_checked_bounds = true;
2761 // On the last choice in the ChoiceNode we generated the quick
2762 // check to fall through on possible success. So now we need to
2763 // generate the full check inline.
2764 if (i == choice_count - 1) {
2765 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002766 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2767 new_trace.set_characters_preloaded(preload_characters);
2768 new_trace.set_bound_checked_up_to(preload_characters);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002769 generate_full_check_inline = true;
2770 }
iposva@chromium.org245aa852009-02-10 00:49:54 +00002771 } else if (alt_gen->quick_check_details.cannot_match()) {
2772 if (i == choice_count - 1 && !greedy_loop) {
2773 macro_assembler->GoTo(trace->backtrack());
2774 }
2775 continue;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002776 } else {
2777 // No quick check was generated. Put the full code here.
2778 // If this is not the first choice then there could be slow checks from
2779 // previous cases that go here when they fail. There's no reason to
2780 // insist that they preload characters since the slow check we are about
2781 // to generate probably can't use it.
2782 if (i != first_normal_choice) {
2783 alt_gen->expects_preload = false;
ager@chromium.org32912102009-01-16 10:38:43 +00002784 new_trace.set_characters_preloaded(0);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002785 }
2786 if (i < choice_count - 1) {
ager@chromium.org32912102009-01-16 10:38:43 +00002787 new_trace.set_backtrack(&alt_gen->after);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002788 }
2789 generate_full_check_inline = true;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002790 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002791 if (generate_full_check_inline) {
ager@chromium.org381abbb2009-02-25 13:23:22 +00002792 if (new_trace.actions() != NULL) {
2793 new_trace.set_flush_budget(new_flush_budget);
2794 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002795 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002796 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002797 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002798 alternative.node()->Emit(compiler, &new_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002799 preload_is_current = false;
2800 }
2801 macro_assembler->Bind(&alt_gen->after);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002802 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002803 if (greedy_loop) {
2804 macro_assembler->Bind(&greedy_loop_label);
2805 // If we have unwound to the bottom then backtrack.
ager@chromium.org32912102009-01-16 10:38:43 +00002806 macro_assembler->CheckGreedyLoop(trace->backtrack());
ager@chromium.org8bb60582008-12-11 12:02:20 +00002807 // Otherwise try the second priority at an earlier position.
2808 macro_assembler->AdvanceCurrentPosition(-text_length);
2809 macro_assembler->GoTo(&second_choice);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002810 }
ager@chromium.org381abbb2009-02-25 13:23:22 +00002811
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002812 // At this point we need to generate slow checks for the alternatives where
2813 // the quick check was inlined. We can recognize these because the associated
2814 // label was bound.
2815 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2816 AlternativeGeneration* alt_gen = alt_gens.at(i);
ager@chromium.org381abbb2009-02-25 13:23:22 +00002817 Trace new_trace(*current_trace);
2818 // If there are actions to be flushed we have to limit how many times
2819 // they are flushed. Take the budget of the parent trace and distribute
2820 // it fairly amongst the children.
2821 if (new_trace.actions() != NULL) {
2822 new_trace.set_flush_budget(new_flush_budget);
2823 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002824 EmitOutOfLineContinuation(compiler,
ager@chromium.org381abbb2009-02-25 13:23:22 +00002825 &new_trace,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002826 alternatives_->at(i),
2827 alt_gen,
2828 preload_characters,
2829 alt_gens.at(i + 1)->expects_preload);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002830 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002831}
2832
2833
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002834void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +00002835 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002836 GuardedAlternative alternative,
2837 AlternativeGeneration* alt_gen,
2838 int preload_characters,
2839 bool next_expects_preload) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002840 if (!alt_gen->possible_success.is_linked()) return;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002841
2842 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2843 macro_assembler->Bind(&alt_gen->possible_success);
ager@chromium.org32912102009-01-16 10:38:43 +00002844 Trace out_of_line_trace(*trace);
2845 out_of_line_trace.set_characters_preloaded(preload_characters);
2846 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
iposva@chromium.org245aa852009-02-10 00:49:54 +00002847 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002848 ZoneList<Guard*>* guards = alternative.guards();
2849 int guard_count = (guards == NULL) ? 0 : guards->length();
2850 if (next_expects_preload) {
2851 Label reload_current_char;
ager@chromium.org32912102009-01-16 10:38:43 +00002852 out_of_line_trace.set_backtrack(&reload_current_char);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002853 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002854 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002855 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002856 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002857 macro_assembler->Bind(&reload_current_char);
2858 // Reload the current character, since the next quick check expects that.
2859 // We don't need to check bounds here because we only get into this
2860 // code through a quick check which already did the checked load.
ager@chromium.org32912102009-01-16 10:38:43 +00002861 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002862 NULL,
2863 false,
2864 preload_characters);
2865 macro_assembler->GoTo(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002866 } else {
ager@chromium.org32912102009-01-16 10:38:43 +00002867 out_of_line_trace.set_backtrack(&(alt_gen->after));
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002868 for (int j = 0; j < guard_count; j++) {
ager@chromium.org32912102009-01-16 10:38:43 +00002869 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002870 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002871 alternative.node()->Emit(compiler, &out_of_line_trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002872 }
2873}
2874
2875
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002876void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002877 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00002878 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002879 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002880 ASSERT(limit_result == CONTINUE);
2881
2882 RecursionCheck rc(compiler);
2883
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002884 switch (type_) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002885 case STORE_POSITION: {
ager@chromium.org32912102009-01-16 10:38:43 +00002886 Trace::DeferredCapture
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002887 new_capture(data_.u_position_register.reg,
2888 data_.u_position_register.is_capture,
2889 trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002890 Trace new_trace = *trace;
2891 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002892 on_success()->Emit(compiler, &new_trace);
2893 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002894 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00002895 case INCREMENT_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002896 Trace::DeferredIncrementRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002897 new_increment(data_.u_increment_register.reg);
ager@chromium.org32912102009-01-16 10:38:43 +00002898 Trace new_trace = *trace;
2899 new_trace.add_action(&new_increment);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002900 on_success()->Emit(compiler, &new_trace);
2901 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002902 }
2903 case SET_REGISTER: {
ager@chromium.org32912102009-01-16 10:38:43 +00002904 Trace::DeferredSetRegister
ager@chromium.org8bb60582008-12-11 12:02:20 +00002905 new_set(data_.u_store_register.reg, data_.u_store_register.value);
ager@chromium.org32912102009-01-16 10:38:43 +00002906 Trace new_trace = *trace;
2907 new_trace.add_action(&new_set);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002908 on_success()->Emit(compiler, &new_trace);
2909 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002910 }
2911 case CLEAR_CAPTURES: {
2912 Trace::DeferredClearCaptures
2913 new_capture(Interval(data_.u_clear_captures.range_from,
2914 data_.u_clear_captures.range_to));
2915 Trace new_trace = *trace;
2916 new_trace.add_action(&new_capture);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002917 on_success()->Emit(compiler, &new_trace);
2918 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00002919 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002920 case BEGIN_SUBMATCH:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002921 if (!trace->is_trivial()) {
2922 trace->Flush(compiler, this);
2923 } else {
2924 assembler->WriteCurrentPositionToRegister(
2925 data_.u_submatch.current_position_register, 0);
2926 assembler->WriteStackPointerToRegister(
2927 data_.u_submatch.stack_pointer_register);
2928 on_success()->Emit(compiler, trace);
2929 }
2930 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002931 case EMPTY_MATCH_CHECK: {
2932 int start_pos_reg = data_.u_empty_match_check.start_register;
2933 int stored_pos = 0;
2934 int rep_reg = data_.u_empty_match_check.repetition_register;
2935 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2936 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2937 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2938 // If we know we haven't advanced and there is no minimum we
2939 // can just backtrack immediately.
2940 assembler->GoTo(trace->backtrack());
ager@chromium.org32912102009-01-16 10:38:43 +00002941 } else if (know_dist && stored_pos < trace->cp_offset()) {
2942 // If we know we've advanced we can generate the continuation
2943 // immediately.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002944 on_success()->Emit(compiler, trace);
2945 } else if (!trace->is_trivial()) {
2946 trace->Flush(compiler, this);
2947 } else {
2948 Label skip_empty_check;
2949 // If we have a minimum number of repetitions we check the current
2950 // number first and skip the empty check if it's not enough.
2951 if (has_minimum) {
2952 int limit = data_.u_empty_match_check.repetition_limit;
2953 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2954 }
2955 // If the match is empty we bail out, otherwise we fall through
2956 // to the on-success continuation.
2957 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2958 trace->backtrack());
2959 assembler->Bind(&skip_empty_check);
2960 on_success()->Emit(compiler, trace);
ager@chromium.org32912102009-01-16 10:38:43 +00002961 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002962 break;
ager@chromium.org32912102009-01-16 10:38:43 +00002963 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002964 case POSITIVE_SUBMATCH_SUCCESS: {
2965 if (!trace->is_trivial()) {
2966 trace->Flush(compiler, this);
2967 return;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002968 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002969 assembler->ReadCurrentPositionFromRegister(
ager@chromium.org8bb60582008-12-11 12:02:20 +00002970 data_.u_submatch.current_position_register);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002971 assembler->ReadStackPointerFromRegister(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002972 data_.u_submatch.stack_pointer_register);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002973 int clear_register_count = data_.u_submatch.clear_register_count;
2974 if (clear_register_count == 0) {
2975 on_success()->Emit(compiler, trace);
2976 return;
2977 }
2978 int clear_registers_from = data_.u_submatch.clear_register_from;
2979 Label clear_registers_backtrack;
2980 Trace new_trace = *trace;
2981 new_trace.set_backtrack(&clear_registers_backtrack);
2982 on_success()->Emit(compiler, &new_trace);
2983
2984 assembler->Bind(&clear_registers_backtrack);
2985 int clear_registers_to = clear_registers_from + clear_register_count - 1;
2986 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
2987
2988 ASSERT(trace->backtrack() == NULL);
2989 assembler->Backtrack();
2990 return;
2991 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002992 default:
2993 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002994 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00002995}
2996
2997
ager@chromium.orgddb913d2009-01-27 10:01:48 +00002998void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00002999 RegExpMacroAssembler* assembler = compiler->macro_assembler();
ager@chromium.org32912102009-01-16 10:38:43 +00003000 if (!trace->is_trivial()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003001 trace->Flush(compiler, this);
3002 return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003003 }
3004
ager@chromium.org32912102009-01-16 10:38:43 +00003005 LimitResult limit_result = LimitVersions(compiler, trace);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003006 if (limit_result == DONE) return;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003007 ASSERT(limit_result == CONTINUE);
3008
3009 RecursionCheck rc(compiler);
3010
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003011 ASSERT_EQ(start_reg_ + 1, end_reg_);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003012 if (compiler->ignore_case()) {
3013 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3014 trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003015 } else {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003016 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003017 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003018 on_success()->Emit(compiler, trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003019}
3020
3021
3022// -------------------------------------------------------------------
3023// Dot/dotty output
3024
3025
3026#ifdef DEBUG
3027
3028
3029class DotPrinter: public NodeVisitor {
3030 public:
3031 explicit DotPrinter(bool ignore_case)
3032 : ignore_case_(ignore_case),
3033 stream_(&alloc_) { }
3034 void PrintNode(const char* label, RegExpNode* node);
3035 void Visit(RegExpNode* node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003036 void PrintAttributes(RegExpNode* from);
3037 StringStream* stream() { return &stream_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003038 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003039#define DECLARE_VISIT(Type) \
3040 virtual void Visit##Type(Type##Node* that);
3041FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3042#undef DECLARE_VISIT
3043 private:
3044 bool ignore_case_;
3045 HeapStringAllocator alloc_;
3046 StringStream stream_;
3047};
3048
3049
3050void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3051 stream()->Add("digraph G {\n graph [label=\"");
3052 for (int i = 0; label[i]; i++) {
3053 switch (label[i]) {
3054 case '\\':
3055 stream()->Add("\\\\");
3056 break;
3057 case '"':
3058 stream()->Add("\"");
3059 break;
3060 default:
3061 stream()->Put(label[i]);
3062 break;
3063 }
3064 }
3065 stream()->Add("\"];\n");
3066 Visit(node);
3067 stream()->Add("}\n");
3068 printf("%s", *(stream()->ToCString()));
3069}
3070
3071
3072void DotPrinter::Visit(RegExpNode* node) {
3073 if (node->info()->visited) return;
3074 node->info()->visited = true;
3075 node->Accept(this);
3076}
3077
3078
3079void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003080 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3081 Visit(on_failure);
3082}
3083
3084
3085class TableEntryBodyPrinter {
3086 public:
3087 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3088 : stream_(stream), choice_(choice) { }
3089 void Call(uc16 from, DispatchTable::Entry entry) {
3090 OutSet* out_set = entry.out_set();
3091 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3092 if (out_set->Get(i)) {
3093 stream()->Add(" n%p:s%io%i -> n%p;\n",
3094 choice(),
3095 from,
3096 i,
3097 choice()->alternatives()->at(i).node());
3098 }
3099 }
3100 }
3101 private:
3102 StringStream* stream() { return stream_; }
3103 ChoiceNode* choice() { return choice_; }
3104 StringStream* stream_;
3105 ChoiceNode* choice_;
3106};
3107
3108
3109class TableEntryHeaderPrinter {
3110 public:
3111 explicit TableEntryHeaderPrinter(StringStream* stream)
3112 : first_(true), stream_(stream) { }
3113 void Call(uc16 from, DispatchTable::Entry entry) {
3114 if (first_) {
3115 first_ = false;
3116 } else {
3117 stream()->Add("|");
3118 }
3119 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3120 OutSet* out_set = entry.out_set();
3121 int priority = 0;
3122 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3123 if (out_set->Get(i)) {
3124 if (priority > 0) stream()->Add("|");
3125 stream()->Add("<s%io%i> %i", from, i, priority);
3126 priority++;
3127 }
3128 }
3129 stream()->Add("}}");
3130 }
3131 private:
3132 bool first_;
3133 StringStream* stream() { return stream_; }
3134 StringStream* stream_;
3135};
3136
3137
3138class AttributePrinter {
3139 public:
3140 explicit AttributePrinter(DotPrinter* out)
3141 : out_(out), first_(true) { }
3142 void PrintSeparator() {
3143 if (first_) {
3144 first_ = false;
3145 } else {
3146 out_->stream()->Add("|");
3147 }
3148 }
3149 void PrintBit(const char* name, bool value) {
3150 if (!value) return;
3151 PrintSeparator();
3152 out_->stream()->Add("{%s}", name);
3153 }
3154 void PrintPositive(const char* name, int value) {
3155 if (value < 0) return;
3156 PrintSeparator();
3157 out_->stream()->Add("{%s|%x}", name, value);
3158 }
3159 private:
3160 DotPrinter* out_;
3161 bool first_;
3162};
3163
3164
3165void DotPrinter::PrintAttributes(RegExpNode* that) {
3166 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3167 "margin=0.1, fontsize=10, label=\"{",
3168 that);
3169 AttributePrinter printer(this);
3170 NodeInfo* info = that->info();
3171 printer.PrintBit("NI", info->follows_newline_interest);
3172 printer.PrintBit("WI", info->follows_word_interest);
3173 printer.PrintBit("SI", info->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003174 Label* label = that->label();
3175 if (label->is_bound())
3176 printer.PrintPositive("@", label->pos());
3177 stream()->Add("}\"];\n");
3178 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3179 "arrowhead=none];\n", that, that);
3180}
3181
3182
3183static const bool kPrintDispatchTable = false;
3184void DotPrinter::VisitChoice(ChoiceNode* that) {
3185 if (kPrintDispatchTable) {
3186 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3187 TableEntryHeaderPrinter header_printer(stream());
3188 that->GetTable(ignore_case_)->ForEach(&header_printer);
3189 stream()->Add("\"]\n", that);
3190 PrintAttributes(that);
3191 TableEntryBodyPrinter body_printer(stream(), that);
3192 that->GetTable(ignore_case_)->ForEach(&body_printer);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003193 } else {
3194 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3195 for (int i = 0; i < that->alternatives()->length(); i++) {
3196 GuardedAlternative alt = that->alternatives()->at(i);
3197 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3198 }
3199 }
3200 for (int i = 0; i < that->alternatives()->length(); i++) {
3201 GuardedAlternative alt = that->alternatives()->at(i);
3202 alt.node()->Accept(this);
3203 }
3204}
3205
3206
3207void DotPrinter::VisitText(TextNode* that) {
3208 stream()->Add(" n%p [label=\"", that);
3209 for (int i = 0; i < that->elements()->length(); i++) {
3210 if (i > 0) stream()->Add(" ");
3211 TextElement elm = that->elements()->at(i);
3212 switch (elm.type) {
3213 case TextElement::ATOM: {
3214 stream()->Add("'%w'", elm.data.u_atom->data());
3215 break;
3216 }
3217 case TextElement::CHAR_CLASS: {
3218 RegExpCharacterClass* node = elm.data.u_char_class;
3219 stream()->Add("[");
3220 if (node->is_negated())
3221 stream()->Add("^");
3222 for (int j = 0; j < node->ranges()->length(); j++) {
3223 CharacterRange range = node->ranges()->at(j);
3224 stream()->Add("%k-%k", range.from(), range.to());
3225 }
3226 stream()->Add("]");
3227 break;
3228 }
3229 default:
3230 UNREACHABLE();
3231 }
3232 }
3233 stream()->Add("\", shape=box, peripheries=2];\n");
3234 PrintAttributes(that);
3235 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3236 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003237}
3238
3239
3240void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3241 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3242 that,
3243 that->start_register(),
3244 that->end_register());
3245 PrintAttributes(that);
3246 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3247 Visit(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003248}
3249
3250
3251void DotPrinter::VisitEnd(EndNode* that) {
3252 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3253 PrintAttributes(that);
3254}
3255
3256
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003257void DotPrinter::VisitAssertion(AssertionNode* that) {
3258 stream()->Add(" n%p [", that);
3259 switch (that->type()) {
3260 case AssertionNode::AT_END:
3261 stream()->Add("label=\"$\", shape=septagon");
3262 break;
3263 case AssertionNode::AT_START:
3264 stream()->Add("label=\"^\", shape=septagon");
3265 break;
3266 case AssertionNode::AT_BOUNDARY:
3267 stream()->Add("label=\"\\b\", shape=septagon");
3268 break;
3269 case AssertionNode::AT_NON_BOUNDARY:
3270 stream()->Add("label=\"\\B\", shape=septagon");
3271 break;
3272 case AssertionNode::AFTER_NEWLINE:
3273 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3274 break;
3275 }
3276 stream()->Add("];\n");
3277 PrintAttributes(that);
3278 RegExpNode* successor = that->on_success();
3279 stream()->Add(" n%p -> n%p;\n", that, successor);
3280 Visit(successor);
3281}
3282
3283
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003284void DotPrinter::VisitAction(ActionNode* that) {
3285 stream()->Add(" n%p [", that);
3286 switch (that->type_) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003287 case ActionNode::SET_REGISTER:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003288 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3289 that->data_.u_store_register.reg,
3290 that->data_.u_store_register.value);
3291 break;
3292 case ActionNode::INCREMENT_REGISTER:
3293 stream()->Add("label=\"$%i++\", shape=octagon",
3294 that->data_.u_increment_register.reg);
3295 break;
3296 case ActionNode::STORE_POSITION:
3297 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3298 that->data_.u_position_register.reg);
3299 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003300 case ActionNode::BEGIN_SUBMATCH:
3301 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3302 that->data_.u_submatch.current_position_register);
3303 break;
ager@chromium.org8bb60582008-12-11 12:02:20 +00003304 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003305 stream()->Add("label=\"escape\", shape=septagon");
3306 break;
ager@chromium.org32912102009-01-16 10:38:43 +00003307 case ActionNode::EMPTY_MATCH_CHECK:
3308 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3309 that->data_.u_empty_match_check.start_register,
3310 that->data_.u_empty_match_check.repetition_register,
3311 that->data_.u_empty_match_check.repetition_limit);
3312 break;
3313 case ActionNode::CLEAR_CAPTURES: {
3314 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3315 that->data_.u_clear_captures.range_from,
3316 that->data_.u_clear_captures.range_to);
3317 break;
3318 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003319 }
3320 stream()->Add("];\n");
3321 PrintAttributes(that);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003322 RegExpNode* successor = that->on_success();
3323 stream()->Add(" n%p -> n%p;\n", that, successor);
3324 Visit(successor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003325}
3326
3327
3328class DispatchTableDumper {
3329 public:
3330 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3331 void Call(uc16 key, DispatchTable::Entry entry);
3332 StringStream* stream() { return stream_; }
3333 private:
3334 StringStream* stream_;
3335};
3336
3337
3338void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3339 stream()->Add("[%k-%k]: {", key, entry.to());
3340 OutSet* set = entry.out_set();
3341 bool first = true;
3342 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3343 if (set->Get(i)) {
3344 if (first) {
3345 first = false;
3346 } else {
3347 stream()->Add(", ");
3348 }
3349 stream()->Add("%i", i);
3350 }
3351 }
3352 stream()->Add("}\n");
3353}
3354
3355
3356void DispatchTable::Dump() {
3357 HeapStringAllocator alloc;
3358 StringStream stream(&alloc);
3359 DispatchTableDumper dumper(&stream);
3360 tree()->ForEach(&dumper);
3361 OS::PrintError("%s", *stream.ToCString());
3362}
3363
3364
3365void RegExpEngine::DotPrint(const char* label,
3366 RegExpNode* node,
3367 bool ignore_case) {
3368 DotPrinter printer(ignore_case);
3369 printer.PrintNode(label, node);
3370}
3371
3372
3373#endif // DEBUG
3374
3375
3376// -------------------------------------------------------------------
3377// Tree to graph conversion
3378
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003379static const int kSpaceRangeCount = 20;
3380static const int kSpaceRangeAsciiCount = 4;
3381static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3382 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3383 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3384
3385static const int kWordRangeCount = 8;
3386static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3387 '_', 'a', 'z' };
3388
3389static const int kDigitRangeCount = 2;
3390static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3391
3392static const int kLineTerminatorRangeCount = 6;
3393static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3394 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003395
3396RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003397 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003398 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3399 elms->Add(TextElement::Atom(this));
ager@chromium.org8bb60582008-12-11 12:02:20 +00003400 return new TextNode(elms, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003401}
3402
3403
3404RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003405 RegExpNode* on_success) {
3406 return new TextNode(elements(), on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003407}
3408
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003409static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3410 const uc16* special_class,
3411 int length) {
3412 ASSERT(ranges->length() != 0);
3413 ASSERT(length != 0);
3414 ASSERT(special_class[0] != 0);
3415 if (ranges->length() != (length >> 1) + 1) {
3416 return false;
3417 }
3418 CharacterRange range = ranges->at(0);
3419 if (range.from() != 0) {
3420 return false;
3421 }
3422 for (int i = 0; i < length; i += 2) {
3423 if (special_class[i] != (range.to() + 1)) {
3424 return false;
3425 }
3426 range = ranges->at((i >> 1) + 1);
3427 if (special_class[i+1] != range.from() - 1) {
3428 return false;
3429 }
3430 }
3431 if (range.to() != 0xffff) {
3432 return false;
3433 }
3434 return true;
3435}
3436
3437
3438static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3439 const uc16* special_class,
3440 int length) {
3441 if (ranges->length() * 2 != length) {
3442 return false;
3443 }
3444 for (int i = 0; i < length; i += 2) {
3445 CharacterRange range = ranges->at(i >> 1);
3446 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3447 return false;
3448 }
3449 }
3450 return true;
3451}
3452
3453
3454bool RegExpCharacterClass::is_standard() {
3455 // TODO(lrn): Remove need for this function, by not throwing away information
3456 // along the way.
3457 if (is_negated_) {
3458 return false;
3459 }
3460 if (set_.is_standard()) {
3461 return true;
3462 }
3463 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3464 set_.set_standard_set_type('s');
3465 return true;
3466 }
3467 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3468 set_.set_standard_set_type('S');
3469 return true;
3470 }
3471 if (CompareInverseRanges(set_.ranges(),
3472 kLineTerminatorRanges,
3473 kLineTerminatorRangeCount)) {
3474 set_.set_standard_set_type('.');
3475 return true;
3476 }
3477 return false;
3478}
3479
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003480
3481RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003482 RegExpNode* on_success) {
3483 return new TextNode(this, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003484}
3485
3486
3487RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003488 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003489 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3490 int length = alternatives->length();
ager@chromium.org8bb60582008-12-11 12:02:20 +00003491 ChoiceNode* result = new ChoiceNode(length);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003492 for (int i = 0; i < length; i++) {
3493 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003494 on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003495 result->AddAlternative(alternative);
3496 }
3497 return result;
3498}
3499
3500
3501RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003502 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003503 return ToNode(min(),
3504 max(),
3505 is_greedy(),
3506 body(),
3507 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003508 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003509}
3510
3511
3512RegExpNode* RegExpQuantifier::ToNode(int min,
3513 int max,
3514 bool is_greedy,
3515 RegExpTree* body,
3516 RegExpCompiler* compiler,
iposva@chromium.org245aa852009-02-10 00:49:54 +00003517 RegExpNode* on_success,
3518 bool not_at_start) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003519 // x{f, t} becomes this:
3520 //
3521 // (r++)<-.
3522 // | `
3523 // | (x)
3524 // v ^
3525 // (r=0)-->(?)---/ [if r < t]
3526 // |
3527 // [if r >= f] \----> ...
3528 //
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003529
3530 // 15.10.2.5 RepeatMatcher algorithm.
3531 // The parser has already eliminated the case where max is 0. In the case
3532 // where max_match is zero the parser has removed the quantifier if min was
3533 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3534
3535 // If we know that we cannot match zero length then things are a little
3536 // simpler since we don't need to make the special zero length match check
3537 // from step 2.1. If the min and max are small we can unroll a little in
3538 // this case.
3539 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3540 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3541 if (max == 0) return on_success; // This can happen due to recursion.
ager@chromium.org32912102009-01-16 10:38:43 +00003542 bool body_can_be_empty = (body->min_match() == 0);
3543 int body_start_reg = RegExpCompiler::kNoRegister;
3544 Interval capture_registers = body->CaptureRegisters();
3545 bool needs_capture_clearing = !capture_registers.is_empty();
3546 if (body_can_be_empty) {
3547 body_start_reg = compiler->AllocateRegister();
ager@chromium.org381abbb2009-02-25 13:23:22 +00003548 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
ager@chromium.org32912102009-01-16 10:38:43 +00003549 // Only unroll if there are no captures and the body can't be
3550 // empty.
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003551 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3552 int new_max = (max == kInfinity) ? max : max - min;
3553 // Recurse once to get the loop or optional matches after the fixed ones.
iposva@chromium.org245aa852009-02-10 00:49:54 +00003554 RegExpNode* answer = ToNode(
3555 0, new_max, is_greedy, body, compiler, on_success, true);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003556 // Unroll the forced matches from 0 to min. This can cause chains of
3557 // TextNodes (which the parser does not generate). These should be
3558 // combined if it turns out they hinder good code generation.
3559 for (int i = 0; i < min; i++) {
3560 answer = body->ToNode(compiler, answer);
3561 }
3562 return answer;
3563 }
3564 if (max <= kMaxUnrolledMaxMatches) {
3565 ASSERT(min == 0);
3566 // Unroll the optional matches up to max.
3567 RegExpNode* answer = on_success;
3568 for (int i = 0; i < max; i++) {
3569 ChoiceNode* alternation = new ChoiceNode(2);
3570 if (is_greedy) {
3571 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3572 answer)));
3573 alternation->AddAlternative(GuardedAlternative(on_success));
3574 } else {
3575 alternation->AddAlternative(GuardedAlternative(on_success));
3576 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3577 answer)));
3578 }
3579 answer = alternation;
iposva@chromium.org245aa852009-02-10 00:49:54 +00003580 if (not_at_start) alternation->set_not_at_start();
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003581 }
3582 return answer;
3583 }
3584 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003585 bool has_min = min > 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003586 bool has_max = max < RegExpTree::kInfinity;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003587 bool needs_counter = has_min || has_max;
ager@chromium.org32912102009-01-16 10:38:43 +00003588 int reg_ctr = needs_counter
3589 ? compiler->AllocateRegister()
3590 : RegExpCompiler::kNoRegister;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003591 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
iposva@chromium.org245aa852009-02-10 00:49:54 +00003592 if (not_at_start) center->set_not_at_start();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003593 RegExpNode* loop_return = needs_counter
3594 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3595 : static_cast<RegExpNode*>(center);
ager@chromium.org32912102009-01-16 10:38:43 +00003596 if (body_can_be_empty) {
3597 // If the body can be empty we need to check if it was and then
3598 // backtrack.
3599 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3600 reg_ctr,
3601 min,
3602 loop_return);
3603 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003604 RegExpNode* body_node = body->ToNode(compiler, loop_return);
ager@chromium.org32912102009-01-16 10:38:43 +00003605 if (body_can_be_empty) {
3606 // If the body can be empty we need to store the start position
3607 // so we can bail out if it was empty.
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003608 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
ager@chromium.org32912102009-01-16 10:38:43 +00003609 }
3610 if (needs_capture_clearing) {
3611 // Before entering the body of this loop we need to clear captures.
3612 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3613 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003614 GuardedAlternative body_alt(body_node);
3615 if (has_max) {
3616 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3617 body_alt.AddGuard(body_guard);
3618 }
3619 GuardedAlternative rest_alt(on_success);
3620 if (has_min) {
3621 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3622 rest_alt.AddGuard(rest_guard);
3623 }
3624 if (is_greedy) {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003625 center->AddLoopAlternative(body_alt);
3626 center->AddContinueAlternative(rest_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003627 } else {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003628 center->AddContinueAlternative(rest_alt);
3629 center->AddLoopAlternative(body_alt);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003630 }
3631 if (needs_counter) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003632 return ActionNode::SetRegister(reg_ctr, 0, center);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003633 } else {
3634 return center;
3635 }
3636}
3637
3638
3639RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003640 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003641 NodeInfo info;
3642 switch (type()) {
3643 case START_OF_LINE:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003644 return AssertionNode::AfterNewline(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003645 case START_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003646 return AssertionNode::AtStart(on_success);
3647 case BOUNDARY:
3648 return AssertionNode::AtBoundary(on_success);
3649 case NON_BOUNDARY:
3650 return AssertionNode::AtNonBoundary(on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003651 case END_OF_INPUT:
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003652 return AssertionNode::AtEnd(on_success);
3653 case END_OF_LINE: {
3654 // Compile $ in multiline regexps as an alternation with a positive
3655 // lookahead in one side and an end-of-input on the other side.
3656 // We need two registers for the lookahead.
3657 int stack_pointer_register = compiler->AllocateRegister();
3658 int position_register = compiler->AllocateRegister();
3659 // The ChoiceNode to distinguish between a newline and end-of-input.
3660 ChoiceNode* result = new ChoiceNode(2);
3661 // Create a newline atom.
3662 ZoneList<CharacterRange>* newline_ranges =
3663 new ZoneList<CharacterRange>(3);
3664 CharacterRange::AddClassEscape('n', newline_ranges);
3665 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3666 TextNode* newline_matcher = new TextNode(
3667 newline_atom,
3668 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3669 position_register,
3670 0, // No captures inside.
3671 -1, // Ignored if no captures.
3672 on_success));
3673 // Create an end-of-input matcher.
3674 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3675 stack_pointer_register,
3676 position_register,
3677 newline_matcher);
3678 // Add the two alternatives to the ChoiceNode.
3679 GuardedAlternative eol_alternative(end_of_line);
3680 result->AddAlternative(eol_alternative);
3681 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3682 result->AddAlternative(end_alternative);
3683 return result;
3684 }
3685 default:
3686 UNREACHABLE();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003687 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003688 return on_success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003689}
3690
3691
3692RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003693 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003694 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3695 RegExpCapture::EndRegister(index()),
ager@chromium.org8bb60582008-12-11 12:02:20 +00003696 on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003697}
3698
3699
3700RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003701 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003702 return on_success;
3703}
3704
3705
3706RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003707 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003708 int stack_pointer_register = compiler->AllocateRegister();
3709 int position_register = compiler->AllocateRegister();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003710
3711 const int registers_per_capture = 2;
3712 const int register_of_first_capture = 2;
3713 int register_count = capture_count_ * registers_per_capture;
3714 int register_start =
3715 register_of_first_capture + capture_from_ * registers_per_capture;
3716
ager@chromium.org8bb60582008-12-11 12:02:20 +00003717 RegExpNode* success;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003718 if (is_positive()) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003719 RegExpNode* node = ActionNode::BeginSubmatch(
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003720 stack_pointer_register,
3721 position_register,
3722 body()->ToNode(
3723 compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003724 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3725 position_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003726 register_count,
3727 register_start,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003728 on_success)));
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003729 return node;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003730 } else {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003731 // We use a ChoiceNode for a negative lookahead because it has most of
3732 // the characteristics we need. It has the body of the lookahead as its
3733 // first alternative and the expression after the lookahead of the second
3734 // alternative. If the first alternative succeeds then the
3735 // NegativeSubmatchSuccess will unwind the stack including everything the
3736 // choice node set up and backtrack. If the first alternative fails then
3737 // the second alternative is tried, which is exactly the desired result
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003738 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3739 // ChoiceNode that knows to ignore the first exit when calculating quick
3740 // checks.
ager@chromium.org8bb60582008-12-11 12:02:20 +00003741 GuardedAlternative body_alt(
3742 body()->ToNode(
3743 compiler,
3744 success = new NegativeSubmatchSuccess(stack_pointer_register,
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003745 position_register,
3746 register_count,
3747 register_start)));
3748 ChoiceNode* choice_node =
3749 new NegativeLookaheadChoiceNode(body_alt,
3750 GuardedAlternative(on_success));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003751 return ActionNode::BeginSubmatch(stack_pointer_register,
3752 position_register,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003753 choice_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003754 }
3755}
3756
3757
3758RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003759 RegExpNode* on_success) {
3760 return ToNode(body(), index(), compiler, on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003761}
3762
3763
3764RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3765 int index,
3766 RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003767 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003768 int start_reg = RegExpCapture::StartRegister(index);
3769 int end_reg = RegExpCapture::EndRegister(index);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003770 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003771 RegExpNode* body_node = body->ToNode(compiler, store_end);
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003772 return ActionNode::StorePosition(start_reg, true, body_node);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003773}
3774
3775
3776RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00003777 RegExpNode* on_success) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003778 ZoneList<RegExpTree*>* children = nodes();
3779 RegExpNode* current = on_success;
3780 for (int i = children->length() - 1; i >= 0; i--) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00003781 current = children->at(i)->ToNode(compiler, current);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003782 }
3783 return current;
3784}
3785
3786
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003787static void AddClass(const uc16* elmv,
3788 int elmc,
3789 ZoneList<CharacterRange>* ranges) {
3790 for (int i = 0; i < elmc; i += 2) {
3791 ASSERT(elmv[i] <= elmv[i + 1]);
3792 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3793 }
3794}
3795
3796
3797static void AddClassNegated(const uc16 *elmv,
3798 int elmc,
3799 ZoneList<CharacterRange>* ranges) {
3800 ASSERT(elmv[0] != 0x0000);
ager@chromium.org8bb60582008-12-11 12:02:20 +00003801 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003802 uc16 last = 0x0000;
3803 for (int i = 0; i < elmc; i += 2) {
3804 ASSERT(last <= elmv[i] - 1);
3805 ASSERT(elmv[i] <= elmv[i + 1]);
3806 ranges->Add(CharacterRange(last, elmv[i] - 1));
3807 last = elmv[i + 1] + 1;
3808 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00003809 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003810}
3811
3812
3813void CharacterRange::AddClassEscape(uc16 type,
3814 ZoneList<CharacterRange>* ranges) {
3815 switch (type) {
3816 case 's':
3817 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3818 break;
3819 case 'S':
3820 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3821 break;
3822 case 'w':
3823 AddClass(kWordRanges, kWordRangeCount, ranges);
3824 break;
3825 case 'W':
3826 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3827 break;
3828 case 'd':
3829 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3830 break;
3831 case 'D':
3832 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3833 break;
3834 case '.':
3835 AddClassNegated(kLineTerminatorRanges,
3836 kLineTerminatorRangeCount,
3837 ranges);
3838 break;
3839 // This is not a character range as defined by the spec but a
3840 // convenient shorthand for a character class that matches any
3841 // character.
3842 case '*':
3843 ranges->Add(CharacterRange::Everything());
3844 break;
ager@chromium.orgddb913d2009-01-27 10:01:48 +00003845 // This is the set of characters matched by the $ and ^ symbols
3846 // in multiline mode.
3847 case 'n':
3848 AddClass(kLineTerminatorRanges,
3849 kLineTerminatorRangeCount,
3850 ranges);
3851 break;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003852 default:
3853 UNREACHABLE();
3854 }
3855}
3856
3857
3858Vector<const uc16> CharacterRange::GetWordBounds() {
3859 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3860}
3861
3862
3863class CharacterRangeSplitter {
3864 public:
3865 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3866 ZoneList<CharacterRange>** excluded)
3867 : included_(included),
3868 excluded_(excluded) { }
3869 void Call(uc16 from, DispatchTable::Entry entry);
3870
3871 static const int kInBase = 0;
3872 static const int kInOverlay = 1;
3873
3874 private:
3875 ZoneList<CharacterRange>** included_;
3876 ZoneList<CharacterRange>** excluded_;
3877};
3878
3879
3880void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3881 if (!entry.out_set()->Get(kInBase)) return;
3882 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3883 ? included_
3884 : excluded_;
3885 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3886 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3887}
3888
3889
3890void CharacterRange::Split(ZoneList<CharacterRange>* base,
3891 Vector<const uc16> overlay,
3892 ZoneList<CharacterRange>** included,
3893 ZoneList<CharacterRange>** excluded) {
3894 ASSERT_EQ(NULL, *included);
3895 ASSERT_EQ(NULL, *excluded);
3896 DispatchTable table;
3897 for (int i = 0; i < base->length(); i++)
3898 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3899 for (int i = 0; i < overlay.length(); i += 2) {
3900 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3901 CharacterRangeSplitter::kInOverlay);
3902 }
3903 CharacterRangeSplitter callback(included, excluded);
3904 table.ForEach(&callback);
3905}
3906
3907
3908void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3909 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3910 if (IsSingleton()) {
3911 // If this is a singleton we just expand the one character.
3912 int length = uncanonicalize.get(from(), '\0', chars);
3913 for (int i = 0; i < length; i++) {
3914 uc32 chr = chars[i];
3915 if (chr != from()) {
3916 ranges->Add(CharacterRange::Singleton(chars[i]));
3917 }
3918 }
3919 } else if (from() <= kRangeCanonicalizeMax &&
3920 to() <= kRangeCanonicalizeMax) {
3921 // If this is a range we expand the characters block by block,
3922 // expanding contiguous subranges (blocks) one at a time.
3923 // The approach is as follows. For a given start character we
3924 // look up the block that contains it, for instance 'a' if the
3925 // start character is 'c'. A block is characterized by the property
3926 // that all characters uncanonicalize in the same way as the first
3927 // element, except that each entry in the result is incremented
3928 // by the distance from the first element. So a-z is a block
3929 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3930 // uncanonicalizes to ['a' + k, 'A' + k].
3931 // Once we've found the start point we look up its uncanonicalization
3932 // and produce a range for each element. For instance for [c-f]
3933 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3934 // add a range if it is not already contained in the input, so [c-f]
3935 // will be skipped but [C-F] will be added. If this range is not
3936 // completely contained in a block we do this for all the blocks
3937 // covered by the range.
3938 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3939 // First, look up the block that contains the 'from' character.
3940 int length = canonrange.get(from(), '\0', range);
3941 if (length == 0) {
3942 range[0] = from();
3943 } else {
3944 ASSERT_EQ(1, length);
3945 }
3946 int pos = from();
3947 // The start of the current block. Note that except for the first
3948 // iteration 'start' is always equal to 'pos'.
3949 int start;
3950 // If it is not the start point of a block the entry contains the
3951 // offset of the character from the start point.
3952 if ((range[0] & kStartMarker) == 0) {
3953 start = pos - range[0];
3954 } else {
3955 start = pos;
3956 }
3957 // Then we add the ranges on at a time, incrementing the current
3958 // position to be after the last block each time. The position
3959 // always points to the start of a block.
3960 while (pos < to()) {
3961 length = canonrange.get(start, '\0', range);
3962 if (length == 0) {
3963 range[0] = start;
3964 } else {
3965 ASSERT_EQ(1, length);
3966 }
3967 ASSERT((range[0] & kStartMarker) != 0);
3968 // The start point of a block contains the distance to the end
3969 // of the range.
3970 int block_end = start + (range[0] & kPayloadMask) - 1;
3971 int end = (block_end > to()) ? to() : block_end;
3972 length = uncanonicalize.get(start, '\0', range);
3973 for (int i = 0; i < length; i++) {
3974 uc32 c = range[i];
3975 uc16 range_from = c + (pos - start);
3976 uc16 range_to = c + (end - start);
3977 if (!(from() <= range_from && range_to <= to())) {
3978 ranges->Add(CharacterRange(range_from, range_to));
3979 }
3980 }
3981 start = pos = block_end + 1;
3982 }
3983 } else {
3984 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3985 }
3986}
3987
3988
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00003989ZoneList<CharacterRange>* CharacterSet::ranges() {
3990 if (ranges_ == NULL) {
3991 ranges_ = new ZoneList<CharacterRange>(2);
3992 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
3993 }
3994 return ranges_;
3995}
3996
3997
3998
ager@chromium.orga74f0da2008-12-03 16:05:52 +00003999// -------------------------------------------------------------------
4000// Interest propagation
4001
4002
4003RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4004 for (int i = 0; i < siblings_.length(); i++) {
4005 RegExpNode* sibling = siblings_.Get(i);
4006 if (sibling->info()->Matches(info))
4007 return sibling;
4008 }
4009 return NULL;
4010}
4011
4012
4013RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4014 ASSERT_EQ(false, *cloned);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004015 siblings_.Ensure(this);
4016 RegExpNode* result = TryGetSibling(info);
4017 if (result != NULL) return result;
4018 result = this->Clone();
4019 NodeInfo* new_info = result->info();
4020 new_info->ResetCompilationState();
4021 new_info->AddFromPreceding(info);
4022 AddSibling(result);
4023 *cloned = true;
4024 return result;
4025}
4026
4027
4028template <class C>
4029static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4030 NodeInfo full_info(*node->info());
4031 full_info.AddFromPreceding(info);
4032 bool cloned = false;
4033 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4034}
4035
4036
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004037// -------------------------------------------------------------------
4038// Splay tree
4039
4040
4041OutSet* OutSet::Extend(unsigned value) {
4042 if (Get(value))
4043 return this;
4044 if (successors() != NULL) {
4045 for (int i = 0; i < successors()->length(); i++) {
4046 OutSet* successor = successors()->at(i);
4047 if (successor->Get(value))
4048 return successor;
4049 }
4050 } else {
4051 successors_ = new ZoneList<OutSet*>(2);
4052 }
4053 OutSet* result = new OutSet(first_, remaining_);
4054 result->Set(value);
4055 successors()->Add(result);
4056 return result;
4057}
4058
4059
4060void OutSet::Set(unsigned value) {
4061 if (value < kFirstLimit) {
4062 first_ |= (1 << value);
4063 } else {
4064 if (remaining_ == NULL)
4065 remaining_ = new ZoneList<unsigned>(1);
4066 if (remaining_->is_empty() || !remaining_->Contains(value))
4067 remaining_->Add(value);
4068 }
4069}
4070
4071
4072bool OutSet::Get(unsigned value) {
4073 if (value < kFirstLimit) {
4074 return (first_ & (1 << value)) != 0;
4075 } else if (remaining_ == NULL) {
4076 return false;
4077 } else {
4078 return remaining_->Contains(value);
4079 }
4080}
4081
4082
4083const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4084const DispatchTable::Entry DispatchTable::Config::kNoValue;
4085
4086
4087void DispatchTable::AddRange(CharacterRange full_range, int value) {
4088 CharacterRange current = full_range;
4089 if (tree()->is_empty()) {
4090 // If this is the first range we just insert into the table.
4091 ZoneSplayTree<Config>::Locator loc;
4092 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4093 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4094 return;
4095 }
4096 // First see if there is a range to the left of this one that
4097 // overlaps.
4098 ZoneSplayTree<Config>::Locator loc;
4099 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4100 Entry* entry = &loc.value();
4101 // If we've found a range that overlaps with this one, and it
4102 // starts strictly to the left of this one, we have to fix it
4103 // because the following code only handles ranges that start on
4104 // or after the start point of the range we're adding.
4105 if (entry->from() < current.from() && entry->to() >= current.from()) {
4106 // Snap the overlapping range in half around the start point of
4107 // the range we're adding.
4108 CharacterRange left(entry->from(), current.from() - 1);
4109 CharacterRange right(current.from(), entry->to());
4110 // The left part of the overlapping range doesn't overlap.
4111 // Truncate the whole entry to be just the left part.
4112 entry->set_to(left.to());
4113 // The right part is the one that overlaps. We add this part
4114 // to the map and let the next step deal with merging it with
4115 // the range we're adding.
4116 ZoneSplayTree<Config>::Locator loc;
4117 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4118 loc.set_value(Entry(right.from(),
4119 right.to(),
4120 entry->out_set()));
4121 }
4122 }
4123 while (current.is_valid()) {
4124 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4125 (loc.value().from() <= current.to()) &&
4126 (loc.value().to() >= current.from())) {
4127 Entry* entry = &loc.value();
4128 // We have overlap. If there is space between the start point of
4129 // the range we're adding and where the overlapping range starts
4130 // then we have to add a range covering just that space.
4131 if (current.from() < entry->from()) {
4132 ZoneSplayTree<Config>::Locator ins;
4133 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4134 ins.set_value(Entry(current.from(),
4135 entry->from() - 1,
4136 empty()->Extend(value)));
4137 current.set_from(entry->from());
4138 }
4139 ASSERT_EQ(current.from(), entry->from());
4140 // If the overlapping range extends beyond the one we want to add
4141 // we have to snap the right part off and add it separately.
4142 if (entry->to() > current.to()) {
4143 ZoneSplayTree<Config>::Locator ins;
4144 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4145 ins.set_value(Entry(current.to() + 1,
4146 entry->to(),
4147 entry->out_set()));
4148 entry->set_to(current.to());
4149 }
4150 ASSERT(entry->to() <= current.to());
4151 // The overlapping range is now completely contained by the range
4152 // we're adding so we can just update it and move the start point
4153 // of the range we're adding just past it.
4154 entry->AddValue(value);
4155 // Bail out if the last interval ended at 0xFFFF since otherwise
4156 // adding 1 will wrap around to 0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004157 if (entry->to() == String::kMaxUC16CharCode)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004158 break;
4159 ASSERT(entry->to() + 1 > current.from());
4160 current.set_from(entry->to() + 1);
4161 } else {
4162 // There is no overlap so we can just add the range
4163 ZoneSplayTree<Config>::Locator ins;
4164 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4165 ins.set_value(Entry(current.from(),
4166 current.to(),
4167 empty()->Extend(value)));
4168 break;
4169 }
4170 }
4171}
4172
4173
4174OutSet* DispatchTable::Get(uc16 value) {
4175 ZoneSplayTree<Config>::Locator loc;
4176 if (!tree()->FindGreatestLessThan(value, &loc))
4177 return empty();
4178 Entry* entry = &loc.value();
4179 if (value <= entry->to())
4180 return entry->out_set();
4181 else
4182 return empty();
4183}
4184
4185
4186// -------------------------------------------------------------------
4187// Analysis
4188
4189
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004190void Analysis::EnsureAnalyzed(RegExpNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004191 if (that->info()->been_analyzed || that->info()->being_analyzed)
4192 return;
4193 that->info()->being_analyzed = true;
4194 that->Accept(this);
4195 that->info()->being_analyzed = false;
4196 that->info()->been_analyzed = true;
4197}
4198
4199
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004200void Analysis::VisitEnd(EndNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004201 // nothing to do
4202}
4203
4204
ager@chromium.org8bb60582008-12-11 12:02:20 +00004205void TextNode::CalculateOffsets() {
4206 int element_count = elements()->length();
4207 // Set up the offsets of the elements relative to the start. This is a fixed
4208 // quantity since a TextNode can only contain fixed-width things.
4209 int cp_offset = 0;
4210 for (int i = 0; i < element_count; i++) {
4211 TextElement& elm = elements()->at(i);
4212 elm.cp_offset = cp_offset;
4213 if (elm.type == TextElement::ATOM) {
4214 cp_offset += elm.data.u_atom->data().length();
4215 } else {
4216 cp_offset++;
4217 Vector<const uc16> quarks = elm.data.u_atom->data();
4218 }
4219 }
4220}
4221
4222
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004223void Analysis::VisitText(TextNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004224 if (ignore_case_) {
4225 that->MakeCaseIndependent();
4226 }
4227 EnsureAnalyzed(that->on_success());
ager@chromium.org8bb60582008-12-11 12:02:20 +00004228 that->CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004229}
4230
4231
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004232void Analysis::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004233 RegExpNode* target = that->on_success();
4234 EnsureAnalyzed(target);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004235 // If the next node is interested in what it follows then this node
4236 // has to be interested too so it can pass the information on.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004237 that->info()->AddFromFollowing(target->info());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004238}
4239
4240
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004241void Analysis::VisitChoice(ChoiceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004242 NodeInfo* info = that->info();
4243 for (int i = 0; i < that->alternatives()->length(); i++) {
4244 RegExpNode* node = that->alternatives()->at(i).node();
4245 EnsureAnalyzed(node);
4246 // Anything the following nodes need to know has to be known by
4247 // this node also, so it can pass it on.
4248 info->AddFromFollowing(node->info());
4249 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004250}
4251
4252
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004253void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4254 NodeInfo* info = that->info();
4255 for (int i = 0; i < that->alternatives()->length(); i++) {
4256 RegExpNode* node = that->alternatives()->at(i).node();
4257 if (node != that->loop_node()) {
4258 EnsureAnalyzed(node);
4259 info->AddFromFollowing(node->info());
4260 }
4261 }
4262 // Check the loop last since it may need the value of this node
4263 // to get a correct result.
4264 EnsureAnalyzed(that->loop_node());
4265 info->AddFromFollowing(that->loop_node()->info());
4266}
4267
4268
4269void Analysis::VisitBackReference(BackReferenceNode* that) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004270 EnsureAnalyzed(that->on_success());
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004271}
4272
4273
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004274void Analysis::VisitAssertion(AssertionNode* that) {
4275 EnsureAnalyzed(that->on_success());
4276}
4277
4278
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004279// -------------------------------------------------------------------
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004280// Dispatch table construction
4281
4282
4283void DispatchTableConstructor::VisitEnd(EndNode* that) {
4284 AddRange(CharacterRange::Everything());
4285}
4286
4287
4288void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4289 node->set_being_calculated(true);
4290 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4291 for (int i = 0; i < alternatives->length(); i++) {
4292 set_choice_index(i);
4293 alternatives->at(i).node()->Accept(this);
4294 }
4295 node->set_being_calculated(false);
4296}
4297
4298
4299class AddDispatchRange {
4300 public:
4301 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4302 : constructor_(constructor) { }
4303 void Call(uc32 from, DispatchTable::Entry entry);
4304 private:
4305 DispatchTableConstructor* constructor_;
4306};
4307
4308
4309void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4310 CharacterRange range(from, entry.to());
4311 constructor_->AddRange(range);
4312}
4313
4314
4315void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4316 if (node->being_calculated())
4317 return;
4318 DispatchTable* table = node->GetTable(ignore_case_);
4319 AddDispatchRange adder(this);
4320 table->ForEach(&adder);
4321}
4322
4323
4324void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4325 // TODO(160): Find the node that we refer back to and propagate its start
4326 // set back to here. For now we just accept anything.
4327 AddRange(CharacterRange::Everything());
4328}
4329
4330
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004331void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4332 RegExpNode* target = that->on_success();
4333 target->Accept(this);
4334}
4335
4336
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004337
4338static int CompareRangeByFrom(const CharacterRange* a,
4339 const CharacterRange* b) {
4340 return Compare<uc16>(a->from(), b->from());
4341}
4342
4343
4344void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4345 ranges->Sort(CompareRangeByFrom);
4346 uc16 last = 0;
4347 for (int i = 0; i < ranges->length(); i++) {
4348 CharacterRange range = ranges->at(i);
4349 if (last < range.from())
4350 AddRange(CharacterRange(last, range.from() - 1));
4351 if (range.to() >= last) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004352 if (range.to() == String::kMaxUC16CharCode) {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004353 return;
4354 } else {
4355 last = range.to() + 1;
4356 }
4357 }
4358 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004359 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004360}
4361
4362
4363void DispatchTableConstructor::VisitText(TextNode* that) {
4364 TextElement elm = that->elements()->at(0);
4365 switch (elm.type) {
4366 case TextElement::ATOM: {
4367 uc16 c = elm.data.u_atom->data()[0];
4368 AddRange(CharacterRange(c, c));
4369 break;
4370 }
4371 case TextElement::CHAR_CLASS: {
4372 RegExpCharacterClass* tree = elm.data.u_char_class;
4373 ZoneList<CharacterRange>* ranges = tree->ranges();
4374 if (tree->is_negated()) {
4375 AddInverse(ranges);
4376 } else {
4377 for (int i = 0; i < ranges->length(); i++)
4378 AddRange(ranges->at(i));
4379 }
4380 break;
4381 }
4382 default: {
4383 UNIMPLEMENTED();
4384 }
4385 }
4386}
4387
4388
4389void DispatchTableConstructor::VisitAction(ActionNode* that) {
ager@chromium.org8bb60582008-12-11 12:02:20 +00004390 RegExpNode* target = that->on_success();
4391 target->Accept(this);
4392}
4393
4394
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004395RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4396 bool ignore_case,
4397 bool is_multiline,
4398 Handle<String> pattern,
4399 bool is_ascii) {
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004400 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
kasperl@chromium.org7be3c992009-03-12 07:19:55 +00004401 return IrregexpRegExpTooBig();
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004402 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00004403 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004404 // Wrap the body of the regexp in capture #0.
ager@chromium.org8bb60582008-12-11 12:02:20 +00004405 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004406 0,
4407 &compiler,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004408 compiler.accept());
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004409 RegExpNode* node = captured_body;
4410 if (!data->tree->IsAnchored()) {
4411 // Add a .*? at the beginning, outside the body capture, unless
4412 // this expression is anchored at the beginning.
iposva@chromium.org245aa852009-02-10 00:49:54 +00004413 RegExpNode* loop_node =
4414 RegExpQuantifier::ToNode(0,
4415 RegExpTree::kInfinity,
4416 false,
4417 new RegExpCharacterClass('*'),
4418 &compiler,
4419 captured_body,
4420 data->contains_anchor);
4421
4422 if (data->contains_anchor) {
4423 // Unroll loop once, to take care of the case that might start
4424 // at the start of input.
4425 ChoiceNode* first_step_node = new ChoiceNode(2);
4426 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4427 first_step_node->AddAlternative(GuardedAlternative(
4428 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4429 node = first_step_node;
4430 } else {
4431 node = loop_node;
4432 }
ager@chromium.orgddb913d2009-01-27 10:01:48 +00004433 }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00004434 data->node = node;
4435 Analysis analysis(ignore_case);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004436 analysis.EnsureAnalyzed(node);
4437
4438 NodeInfo info = *node->info();
ager@chromium.org8bb60582008-12-11 12:02:20 +00004439
ager@chromium.orgbb29dc92009-03-24 13:25:23 +00004440 if (RegExpImpl::UseNativeRegexp()) {
ager@chromium.org9085a012009-05-11 19:22:57 +00004441#ifdef V8_TARGET_ARCH_ARM
ager@chromium.orgbb29dc92009-03-24 13:25:23 +00004442 UNREACHABLE();
ager@chromium.org5ec48922009-05-05 07:25:34 +00004443#endif
ager@chromium.org9085a012009-05-11 19:22:57 +00004444#ifdef V8_TARGET_ARCH_X64
ager@chromium.org5ec48922009-05-05 07:25:34 +00004445 UNREACHABLE();
4446#endif
ager@chromium.org9085a012009-05-11 19:22:57 +00004447#ifdef V8_TARGET_ARCH_IA32
ager@chromium.org8bb60582008-12-11 12:02:20 +00004448 RegExpMacroAssemblerIA32::Mode mode;
4449 if (is_ascii) {
4450 mode = RegExpMacroAssemblerIA32::ASCII;
4451 } else {
4452 mode = RegExpMacroAssemblerIA32::UC16;
4453 }
4454 RegExpMacroAssemblerIA32 macro_assembler(mode,
4455 (data->capture_count + 1) * 2);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004456 return compiler.Assemble(&macro_assembler,
4457 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004458 data->capture_count,
4459 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004460#endif
4461 }
4462 EmbeddedVector<byte, 1024> codes;
4463 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4464 return compiler.Assemble(&macro_assembler,
4465 node,
ager@chromium.org8bb60582008-12-11 12:02:20 +00004466 data->capture_count,
4467 pattern);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00004468}
4469
4470
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00004471}} // namespace v8::internal