blob: e518662d071fff60837c2925f0fa3bea7020fb74 [file] [log] [blame]
Steve Blocka7e24c12009-10-30 11:49:00 +00001// Copyright 2006-2009 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "compiler.h"
32#include "execution.h"
33#include "factory.h"
34#include "jsregexp.h"
35#include "platform.h"
36#include "runtime.h"
37#include "top.h"
38#include "compilation-cache.h"
39#include "string-stream.h"
40#include "parser.h"
41#include "regexp-macro-assembler.h"
42#include "regexp-macro-assembler-tracer.h"
43#include "regexp-macro-assembler-irregexp.h"
44#include "regexp-stack.h"
45
46#ifdef V8_NATIVE_REGEXP
47#if V8_TARGET_ARCH_IA32
48#include "ia32/macro-assembler-ia32.h"
49#include "ia32/regexp-macro-assembler-ia32.h"
50#elif V8_TARGET_ARCH_X64
51#include "x64/macro-assembler-x64.h"
52#include "x64/regexp-macro-assembler-x64.h"
53#elif V8_TARGET_ARCH_ARM
54#include "arm/macro-assembler-arm.h"
55#include "arm/regexp-macro-assembler-arm.h"
56#else
57#error Unsupported target architecture.
58#endif
59#endif
60
61#include "interpreter-irregexp.h"
62
63
64namespace v8 {
65namespace internal {
66
67
68Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
69 Handle<String> pattern,
70 Handle<String> flags,
71 bool* has_pending_exception) {
72 // Ensure that the constructor function has been loaded.
73 if (!constructor->IsLoaded()) {
74 LoadLazy(constructor, has_pending_exception);
75 if (*has_pending_exception) return Handle<Object>();
76 }
77 // Call the construct code with 2 arguments.
78 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
79 Handle<Object>::cast(flags).location() };
80 return Execution::New(constructor, 2, argv, has_pending_exception);
81}
82
83
84static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
85 int flags = JSRegExp::NONE;
86 for (int i = 0; i < str->length(); i++) {
87 switch (str->Get(i)) {
88 case 'i':
89 flags |= JSRegExp::IGNORE_CASE;
90 break;
91 case 'g':
92 flags |= JSRegExp::GLOBAL;
93 break;
94 case 'm':
95 flags |= JSRegExp::MULTILINE;
96 break;
97 }
98 }
99 return JSRegExp::Flags(flags);
100}
101
102
103static inline void ThrowRegExpException(Handle<JSRegExp> re,
104 Handle<String> pattern,
105 Handle<String> error_text,
106 const char* message) {
107 Handle<JSArray> array = Factory::NewJSArray(2);
108 SetElement(array, 0, pattern);
109 SetElement(array, 1, error_text);
110 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
111 Top::Throw(*regexp_err);
112}
113
114
115// Generic RegExp methods. Dispatches to implementation specific methods.
116
117
118class OffsetsVector {
119 public:
120 inline OffsetsVector(int num_registers)
121 : offsets_vector_length_(num_registers) {
122 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
123 vector_ = NewArray<int>(offsets_vector_length_);
124 } else {
125 vector_ = static_offsets_vector_;
126 }
127 }
128 inline ~OffsetsVector() {
129 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
130 DeleteArray(vector_);
131 vector_ = NULL;
132 }
133 }
134 inline int* vector() { return vector_; }
135 inline int length() { return offsets_vector_length_; }
136
137 private:
138 int* vector_;
139 int offsets_vector_length_;
140 static const int kStaticOffsetsVectorSize = 50;
141 static int static_offsets_vector_[kStaticOffsetsVectorSize];
142};
143
144
145int OffsetsVector::static_offsets_vector_[
146 OffsetsVector::kStaticOffsetsVectorSize];
147
148
149Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
150 Handle<String> pattern,
151 Handle<String> flag_str) {
152 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
153 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
154 bool in_cache = !cached.is_null();
155 LOG(RegExpCompileEvent(re, in_cache));
156
157 Handle<Object> result;
158 if (in_cache) {
159 re->set_data(*cached);
160 return re;
161 }
162 FlattenString(pattern);
163 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
164 RegExpCompileData parse_result;
165 FlatStringReader reader(pattern);
166 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
167 // Throw an exception if we fail to parse the pattern.
168 ThrowRegExpException(re,
169 pattern,
170 parse_result.error,
171 "malformed_regexp");
172 return Handle<Object>::null();
173 }
174
175 if (parse_result.simple && !flags.is_ignore_case()) {
176 // Parse-tree is a single atom that is equal to the pattern.
177 AtomCompile(re, pattern, flags, pattern);
178 } else if (parse_result.tree->IsAtom() &&
179 !flags.is_ignore_case() &&
180 parse_result.capture_count == 0) {
181 RegExpAtom* atom = parse_result.tree->AsAtom();
182 Vector<const uc16> atom_pattern = atom->data();
183 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
184 AtomCompile(re, pattern, flags, atom_string);
185 } else {
186 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
187 }
188 ASSERT(re->data()->IsFixedArray());
189 // Compilation succeeded so the data is set on the regexp
190 // and we can store it in the cache.
191 Handle<FixedArray> data(FixedArray::cast(re->data()));
192 CompilationCache::PutRegExp(pattern, flags, data);
193
194 return re;
195}
196
197
198Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
199 Handle<String> subject,
200 int index,
201 Handle<JSArray> last_match_info) {
202 switch (regexp->TypeTag()) {
203 case JSRegExp::ATOM:
204 return AtomExec(regexp, subject, index, last_match_info);
205 case JSRegExp::IRREGEXP: {
206 Handle<Object> result =
207 IrregexpExec(regexp, subject, index, last_match_info);
208 ASSERT(!result.is_null() || Top::has_pending_exception());
209 return result;
210 }
211 default:
212 UNREACHABLE();
213 return Handle<Object>::null();
214 }
215}
216
217
218// RegExp Atom implementation: Simple string search using indexOf.
219
220
221void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
222 Handle<String> pattern,
223 JSRegExp::Flags flags,
224 Handle<String> match_pattern) {
225 Factory::SetRegExpAtomData(re,
226 JSRegExp::ATOM,
227 pattern,
228 flags,
229 match_pattern);
230}
231
232
233static void SetAtomLastCapture(FixedArray* array,
234 String* subject,
235 int from,
236 int to) {
237 NoHandleAllocation no_handles;
238 RegExpImpl::SetLastCaptureCount(array, 2);
239 RegExpImpl::SetLastSubject(array, subject);
240 RegExpImpl::SetLastInput(array, subject);
241 RegExpImpl::SetCapture(array, 0, from);
242 RegExpImpl::SetCapture(array, 1, to);
243}
244
245
246Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
247 Handle<String> subject,
248 int index,
249 Handle<JSArray> last_match_info) {
250 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
251
252 uint32_t start_index = index;
253
254 int value = Runtime::StringMatch(subject, needle, start_index);
255 if (value == -1) return Factory::null_value();
256 ASSERT(last_match_info->HasFastElements());
257
258 {
259 NoHandleAllocation no_handles;
260 FixedArray* array = FixedArray::cast(last_match_info->elements());
261 SetAtomLastCapture(array, *subject, value, value + needle->length());
262 }
263 return last_match_info;
264}
265
266
267// Irregexp implementation.
268
269// Ensures that the regexp object contains a compiled version of the
270// source for either ASCII or non-ASCII strings.
271// If the compiled version doesn't already exist, it is compiled
272// from the source pattern.
273// If compilation fails, an exception is thrown and this function
274// returns false.
275bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
276 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
277#ifdef V8_NATIVE_REGEXP
278 if (compiled_code->IsCode()) return true;
279#else // ! V8_NATIVE_REGEXP (RegExp interpreter code)
280 if (compiled_code->IsByteArray()) return true;
281#endif
282 return CompileIrregexp(re, is_ascii);
283}
284
285
286bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
287 // Compile the RegExp.
288 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
289 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
290 if (entry->IsJSObject()) {
291 // If it's a JSObject, a previous compilation failed and threw this object.
292 // Re-throw the object without trying again.
293 Top::Throw(entry);
294 return false;
295 }
296 ASSERT(entry->IsTheHole());
297
298 JSRegExp::Flags flags = re->GetFlags();
299
300 Handle<String> pattern(re->Pattern());
301 if (!pattern->IsFlat()) {
302 FlattenString(pattern);
303 }
304
305 RegExpCompileData compile_data;
306 FlatStringReader reader(pattern);
307 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
308 // Throw an exception if we fail to parse the pattern.
309 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
310 ThrowRegExpException(re,
311 pattern,
312 compile_data.error,
313 "malformed_regexp");
314 return false;
315 }
316 RegExpEngine::CompilationResult result =
317 RegExpEngine::Compile(&compile_data,
318 flags.is_ignore_case(),
319 flags.is_multiline(),
320 pattern,
321 is_ascii);
322 if (result.error_message != NULL) {
323 // Unable to compile regexp.
324 Handle<JSArray> array = Factory::NewJSArray(2);
325 SetElement(array, 0, pattern);
326 SetElement(array,
327 1,
328 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
329 Handle<Object> regexp_err =
330 Factory::NewSyntaxError("malformed_regexp", array);
331 Top::Throw(*regexp_err);
332 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
333 return false;
334 }
335
336 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
337 data->set(JSRegExp::code_index(is_ascii), result.code);
338 int register_max = IrregexpMaxRegisterCount(*data);
339 if (result.num_registers > register_max) {
340 SetIrregexpMaxRegisterCount(*data, result.num_registers);
341 }
342
343 return true;
344}
345
346
347int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
348 return Smi::cast(
349 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
350}
351
352
353void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
354 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
355}
356
357
358int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
359 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
360}
361
362
363int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
364 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
365}
366
367
368ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
369 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
370}
371
372
373Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
374 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
375}
376
377
378void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
379 Handle<String> pattern,
380 JSRegExp::Flags flags,
381 int capture_count) {
382 // Initialize compiled code entries to null.
383 Factory::SetRegExpIrregexpData(re,
384 JSRegExp::IRREGEXP,
385 pattern,
386 flags,
387 capture_count);
388}
389
390
391Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
392 Handle<String> subject,
393 int previous_index,
394 Handle<JSArray> last_match_info) {
395 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
396
397 // Prepare space for the return values.
398 int number_of_capture_registers =
399 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
400
401#ifndef V8_NATIVE_REGEXP
402#ifdef DEBUG
403 if (FLAG_trace_regexp_bytecodes) {
404 String* pattern = jsregexp->Pattern();
405 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
406 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
407 }
408#endif
409#endif
410
411 if (!subject->IsFlat()) {
412 FlattenString(subject);
413 }
414
415 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
416
417 Handle<FixedArray> array;
418
419 // Dispatch to the correct RegExp implementation.
420 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
421
422#ifdef V8_NATIVE_REGEXP
423
424 OffsetsVector captures(number_of_capture_registers);
425 int* captures_vector = captures.vector();
426 NativeRegExpMacroAssembler::Result res;
427 do {
428 bool is_ascii = subject->IsAsciiRepresentation();
429 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
430 return Handle<Object>::null();
431 }
432 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
433 res = NativeRegExpMacroAssembler::Match(code,
434 subject,
435 captures_vector,
436 captures.length(),
437 previous_index);
438 // If result is RETRY, the string have changed representation, and we
439 // must restart from scratch.
440 } while (res == NativeRegExpMacroAssembler::RETRY);
441 if (res == NativeRegExpMacroAssembler::EXCEPTION) {
442 ASSERT(Top::has_pending_exception());
443 return Handle<Object>::null();
444 }
445 ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
446 || res == NativeRegExpMacroAssembler::FAILURE);
447
448 if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
449
450 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
451 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
452 // The captures come in (start, end+1) pairs.
453 for (int i = 0; i < number_of_capture_registers; i += 2) {
454 SetCapture(*array, i, captures_vector[i]);
455 SetCapture(*array, i + 1, captures_vector[i + 1]);
456 }
457
458#else // ! V8_NATIVE_REGEXP
459
460 bool is_ascii = subject->IsAsciiRepresentation();
461 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
462 return Handle<Object>::null();
463 }
464 // Now that we have done EnsureCompiledIrregexp we can get the number of
465 // registers.
466 int number_of_registers =
467 IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
468 OffsetsVector registers(number_of_registers);
469 int* register_vector = registers.vector();
470 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
471 register_vector[i] = -1;
472 }
473 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
474
475 if (!IrregexpInterpreter::Match(byte_codes,
476 subject,
477 register_vector,
478 previous_index)) {
479 return Factory::null_value();
480 }
481
482 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
483 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
484 // The captures come in (start, end+1) pairs.
485 for (int i = 0; i < number_of_capture_registers; i += 2) {
486 SetCapture(*array, i, register_vector[i]);
487 SetCapture(*array, i + 1, register_vector[i + 1]);
488 }
489
490#endif // V8_NATIVE_REGEXP
491
492 SetLastCaptureCount(*array, number_of_capture_registers);
493 SetLastSubject(*array, *subject);
494 SetLastInput(*array, *subject);
495
496 return last_match_info;
497}
498
499
500// -------------------------------------------------------------------
501// Implementation of the Irregexp regular expression engine.
502//
503// The Irregexp regular expression engine is intended to be a complete
504// implementation of ECMAScript regular expressions. It generates either
505// bytecodes or native code.
506
507// The Irregexp regexp engine is structured in three steps.
508// 1) The parser generates an abstract syntax tree. See ast.cc.
509// 2) From the AST a node network is created. The nodes are all
510// subclasses of RegExpNode. The nodes represent states when
511// executing a regular expression. Several optimizations are
512// performed on the node network.
513// 3) From the nodes we generate either byte codes or native code
514// that can actually execute the regular expression (perform
515// the search). The code generation step is described in more
516// detail below.
517
518// Code generation.
519//
520// The nodes are divided into four main categories.
521// * Choice nodes
522// These represent places where the regular expression can
523// match in more than one way. For example on entry to an
524// alternation (foo|bar) or a repetition (*, +, ? or {}).
525// * Action nodes
526// These represent places where some action should be
527// performed. Examples include recording the current position
528// in the input string to a register (in order to implement
529// captures) or other actions on register for example in order
530// to implement the counters needed for {} repetitions.
531// * Matching nodes
532// These attempt to match some element part of the input string.
533// Examples of elements include character classes, plain strings
534// or back references.
535// * End nodes
536// These are used to implement the actions required on finding
537// a successful match or failing to find a match.
538//
539// The code generated (whether as byte codes or native code) maintains
540// some state as it runs. This consists of the following elements:
541//
542// * The capture registers. Used for string captures.
543// * Other registers. Used for counters etc.
544// * The current position.
545// * The stack of backtracking information. Used when a matching node
546// fails to find a match and needs to try an alternative.
547//
548// Conceptual regular expression execution model:
549//
550// There is a simple conceptual model of regular expression execution
551// which will be presented first. The actual code generated is a more
552// efficient simulation of the simple conceptual model:
553//
554// * Choice nodes are implemented as follows:
555// For each choice except the last {
556// push current position
557// push backtrack code location
558// <generate code to test for choice>
559// backtrack code location:
560// pop current position
561// }
562// <generate code to test for last choice>
563//
564// * Actions nodes are generated as follows
565// <push affected registers on backtrack stack>
566// <generate code to perform action>
567// push backtrack code location
568// <generate code to test for following nodes>
569// backtrack code location:
570// <pop affected registers to restore their state>
571// <pop backtrack location from stack and go to it>
572//
573// * Matching nodes are generated as follows:
574// if input string matches at current position
575// update current position
576// <generate code to test for following nodes>
577// else
578// <pop backtrack location from stack and go to it>
579//
580// Thus it can be seen that the current position is saved and restored
581// by the choice nodes, whereas the registers are saved and restored by
582// by the action nodes that manipulate them.
583//
584// The other interesting aspect of this model is that nodes are generated
585// at the point where they are needed by a recursive call to Emit(). If
586// the node has already been code generated then the Emit() call will
587// generate a jump to the previously generated code instead. In order to
588// limit recursion it is possible for the Emit() function to put the node
589// on a work list for later generation and instead generate a jump. The
590// destination of the jump is resolved later when the code is generated.
591//
592// Actual regular expression code generation.
593//
594// Code generation is actually more complicated than the above. In order
595// to improve the efficiency of the generated code some optimizations are
596// performed
597//
598// * Choice nodes have 1-character lookahead.
599// A choice node looks at the following character and eliminates some of
600// the choices immediately based on that character. This is not yet
601// implemented.
602// * Simple greedy loops store reduced backtracking information.
603// A quantifier like /.*foo/m will greedily match the whole input. It will
604// then need to backtrack to a point where it can match "foo". The naive
605// implementation of this would push each character position onto the
606// backtracking stack, then pop them off one by one. This would use space
607// proportional to the length of the input string. However since the "."
608// can only match in one way and always has a constant length (in this case
609// of 1) it suffices to store the current position on the top of the stack
610// once. Matching now becomes merely incrementing the current position and
611// backtracking becomes decrementing the current position and checking the
612// result against the stored current position. This is faster and saves
613// space.
614// * The current state is virtualized.
615// This is used to defer expensive operations until it is clear that they
616// are needed and to generate code for a node more than once, allowing
617// specialized an efficient versions of the code to be created. This is
618// explained in the section below.
619//
620// Execution state virtualization.
621//
622// Instead of emitting code, nodes that manipulate the state can record their
623// manipulation in an object called the Trace. The Trace object can record a
624// current position offset, an optional backtrack code location on the top of
625// the virtualized backtrack stack and some register changes. When a node is
626// to be emitted it can flush the Trace or update it. Flushing the Trace
627// will emit code to bring the actual state into line with the virtual state.
628// Avoiding flushing the state can postpone some work (eg updates of capture
629// registers). Postponing work can save time when executing the regular
630// expression since it may be found that the work never has to be done as a
631// failure to match can occur. In addition it is much faster to jump to a
632// known backtrack code location than it is to pop an unknown backtrack
633// location from the stack and jump there.
634//
635// The virtual state found in the Trace affects code generation. For example
636// the virtual state contains the difference between the actual current
637// position and the virtual current position, and matching code needs to use
638// this offset to attempt a match in the correct location of the input
639// string. Therefore code generated for a non-trivial trace is specialized
640// to that trace. The code generator therefore has the ability to generate
641// code for each node several times. In order to limit the size of the
642// generated code there is an arbitrary limit on how many specialized sets of
643// code may be generated for a given node. If the limit is reached, the
644// trace is flushed and a generic version of the code for a node is emitted.
645// This is subsequently used for that node. The code emitted for non-generic
646// trace is not recorded in the node and so it cannot currently be reused in
647// the event that code generation is requested for an identical trace.
648
649
650void RegExpTree::AppendToText(RegExpText* text) {
651 UNREACHABLE();
652}
653
654
655void RegExpAtom::AppendToText(RegExpText* text) {
656 text->AddElement(TextElement::Atom(this));
657}
658
659
660void RegExpCharacterClass::AppendToText(RegExpText* text) {
661 text->AddElement(TextElement::CharClass(this));
662}
663
664
665void RegExpText::AppendToText(RegExpText* text) {
666 for (int i = 0; i < elements()->length(); i++)
667 text->AddElement(elements()->at(i));
668}
669
670
671TextElement TextElement::Atom(RegExpAtom* atom) {
672 TextElement result = TextElement(ATOM);
673 result.data.u_atom = atom;
674 return result;
675}
676
677
678TextElement TextElement::CharClass(
679 RegExpCharacterClass* char_class) {
680 TextElement result = TextElement(CHAR_CLASS);
681 result.data.u_char_class = char_class;
682 return result;
683}
684
685
686int TextElement::length() {
687 if (type == ATOM) {
688 return data.u_atom->length();
689 } else {
690 ASSERT(type == CHAR_CLASS);
691 return 1;
692 }
693}
694
695
696DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
697 if (table_ == NULL) {
698 table_ = new DispatchTable();
699 DispatchTableConstructor cons(table_, ignore_case);
700 cons.BuildTable(this);
701 }
702 return table_;
703}
704
705
706class RegExpCompiler {
707 public:
708 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
709
710 int AllocateRegister() {
711 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
712 reg_exp_too_big_ = true;
713 return next_register_;
714 }
715 return next_register_++;
716 }
717
718 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
719 RegExpNode* start,
720 int capture_count,
721 Handle<String> pattern);
722
723 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
724
725 static const int kImplementationOffset = 0;
726 static const int kNumberOfRegistersOffset = 0;
727 static const int kCodeOffset = 1;
728
729 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
730 EndNode* accept() { return accept_; }
731
732 static const int kMaxRecursion = 100;
733 inline int recursion_depth() { return recursion_depth_; }
734 inline void IncrementRecursionDepth() { recursion_depth_++; }
735 inline void DecrementRecursionDepth() { recursion_depth_--; }
736
737 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
738
739 inline bool ignore_case() { return ignore_case_; }
740 inline bool ascii() { return ascii_; }
741
742 static const int kNoRegister = -1;
743 private:
744 EndNode* accept_;
745 int next_register_;
746 List<RegExpNode*>* work_list_;
747 int recursion_depth_;
748 RegExpMacroAssembler* macro_assembler_;
749 bool ignore_case_;
750 bool ascii_;
751 bool reg_exp_too_big_;
752};
753
754
755class RecursionCheck {
756 public:
757 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
758 compiler->IncrementRecursionDepth();
759 }
760 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
761 private:
762 RegExpCompiler* compiler_;
763};
764
765
766static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
767 return RegExpEngine::CompilationResult("RegExp too big");
768}
769
770
771// Attempts to compile the regexp using an Irregexp code generator. Returns
772// a fixed array or a null handle depending on whether it succeeded.
773RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
774 : next_register_(2 * (capture_count + 1)),
775 work_list_(NULL),
776 recursion_depth_(0),
777 ignore_case_(ignore_case),
778 ascii_(ascii),
779 reg_exp_too_big_(false) {
780 accept_ = new EndNode(EndNode::ACCEPT);
781 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
782}
783
784
785RegExpEngine::CompilationResult RegExpCompiler::Assemble(
786 RegExpMacroAssembler* macro_assembler,
787 RegExpNode* start,
788 int capture_count,
789 Handle<String> pattern) {
790#ifdef DEBUG
791 if (FLAG_trace_regexp_assembler)
792 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
793 else
794#endif
795 macro_assembler_ = macro_assembler;
796 List <RegExpNode*> work_list(0);
797 work_list_ = &work_list;
798 Label fail;
799 macro_assembler_->PushBacktrack(&fail);
800 Trace new_trace;
801 start->Emit(this, &new_trace);
802 macro_assembler_->Bind(&fail);
803 macro_assembler_->Fail();
804 while (!work_list.is_empty()) {
805 work_list.RemoveLast()->Emit(this, &new_trace);
806 }
807 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
808
809 Handle<Object> code = macro_assembler_->GetCode(pattern);
810
811 work_list_ = NULL;
812#ifdef DEBUG
813 if (FLAG_trace_regexp_assembler) {
814 delete macro_assembler_;
815 }
816#endif
817 return RegExpEngine::CompilationResult(*code, next_register_);
818}
819
820
821bool Trace::DeferredAction::Mentions(int that) {
822 if (type() == ActionNode::CLEAR_CAPTURES) {
823 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
824 return range.Contains(that);
825 } else {
826 return reg() == that;
827 }
828}
829
830
831bool Trace::mentions_reg(int reg) {
832 for (DeferredAction* action = actions_;
833 action != NULL;
834 action = action->next()) {
835 if (action->Mentions(reg))
836 return true;
837 }
838 return false;
839}
840
841
842bool Trace::GetStoredPosition(int reg, int* cp_offset) {
843 ASSERT_EQ(0, *cp_offset);
844 for (DeferredAction* action = actions_;
845 action != NULL;
846 action = action->next()) {
847 if (action->Mentions(reg)) {
848 if (action->type() == ActionNode::STORE_POSITION) {
849 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
850 return true;
851 } else {
852 return false;
853 }
854 }
855 }
856 return false;
857}
858
859
860int Trace::FindAffectedRegisters(OutSet* affected_registers) {
861 int max_register = RegExpCompiler::kNoRegister;
862 for (DeferredAction* action = actions_;
863 action != NULL;
864 action = action->next()) {
865 if (action->type() == ActionNode::CLEAR_CAPTURES) {
866 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
867 for (int i = range.from(); i <= range.to(); i++)
868 affected_registers->Set(i);
869 if (range.to() > max_register) max_register = range.to();
870 } else {
871 affected_registers->Set(action->reg());
872 if (action->reg() > max_register) max_register = action->reg();
873 }
874 }
875 return max_register;
876}
877
878
879void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
880 int max_register,
881 OutSet& registers_to_pop,
882 OutSet& registers_to_clear) {
883 for (int reg = max_register; reg >= 0; reg--) {
884 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
885 else if (registers_to_clear.Get(reg)) {
886 int clear_to = reg;
887 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
888 reg--;
889 }
890 assembler->ClearRegisters(reg, clear_to);
891 }
892 }
893}
894
895
896void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
897 int max_register,
898 OutSet& affected_registers,
899 OutSet* registers_to_pop,
900 OutSet* registers_to_clear) {
901 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
902 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
903
904 // Count pushes performed to force a stack limit check occasionally.
905 int pushes = 0;
906
907 for (int reg = 0; reg <= max_register; reg++) {
908 if (!affected_registers.Get(reg)) {
909 continue;
910 }
911
912 // The chronologically first deferred action in the trace
913 // is used to infer the action needed to restore a register
914 // to its previous state (or not, if it's safe to ignore it).
915 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
916 DeferredActionUndoType undo_action = IGNORE;
917
918 int value = 0;
919 bool absolute = false;
920 bool clear = false;
921 int store_position = -1;
922 // This is a little tricky because we are scanning the actions in reverse
923 // historical order (newest first).
924 for (DeferredAction* action = actions_;
925 action != NULL;
926 action = action->next()) {
927 if (action->Mentions(reg)) {
928 switch (action->type()) {
929 case ActionNode::SET_REGISTER: {
930 Trace::DeferredSetRegister* psr =
931 static_cast<Trace::DeferredSetRegister*>(action);
932 if (!absolute) {
933 value += psr->value();
934 absolute = true;
935 }
936 // SET_REGISTER is currently only used for newly introduced loop
937 // counters. They can have a significant previous value if they
938 // occour in a loop. TODO(lrn): Propagate this information, so
939 // we can set undo_action to IGNORE if we know there is no value to
940 // restore.
941 undo_action = RESTORE;
942 ASSERT_EQ(store_position, -1);
943 ASSERT(!clear);
944 break;
945 }
946 case ActionNode::INCREMENT_REGISTER:
947 if (!absolute) {
948 value++;
949 }
950 ASSERT_EQ(store_position, -1);
951 ASSERT(!clear);
952 undo_action = RESTORE;
953 break;
954 case ActionNode::STORE_POSITION: {
955 Trace::DeferredCapture* pc =
956 static_cast<Trace::DeferredCapture*>(action);
957 if (!clear && store_position == -1) {
958 store_position = pc->cp_offset();
959 }
960
961 // For captures we know that stores and clears alternate.
962 // Other register, are never cleared, and if the occur
963 // inside a loop, they might be assigned more than once.
964 if (reg <= 1) {
965 // Registers zero and one, aka "capture zero", is
966 // always set correctly if we succeed. There is no
967 // need to undo a setting on backtrack, because we
968 // will set it again or fail.
969 undo_action = IGNORE;
970 } else {
971 undo_action = pc->is_capture() ? CLEAR : RESTORE;
972 }
973 ASSERT(!absolute);
974 ASSERT_EQ(value, 0);
975 break;
976 }
977 case ActionNode::CLEAR_CAPTURES: {
978 // Since we're scanning in reverse order, if we've already
979 // set the position we have to ignore historically earlier
980 // clearing operations.
981 if (store_position == -1) {
982 clear = true;
983 }
984 undo_action = RESTORE;
985 ASSERT(!absolute);
986 ASSERT_EQ(value, 0);
987 break;
988 }
989 default:
990 UNREACHABLE();
991 break;
992 }
993 }
994 }
995 // Prepare for the undo-action (e.g., push if it's going to be popped).
996 if (undo_action == RESTORE) {
997 pushes++;
998 RegExpMacroAssembler::StackCheckFlag stack_check =
999 RegExpMacroAssembler::kNoStackLimitCheck;
1000 if (pushes == push_limit) {
1001 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1002 pushes = 0;
1003 }
1004
1005 assembler->PushRegister(reg, stack_check);
1006 registers_to_pop->Set(reg);
1007 } else if (undo_action == CLEAR) {
1008 registers_to_clear->Set(reg);
1009 }
1010 // Perform the chronologically last action (or accumulated increment)
1011 // for the register.
1012 if (store_position != -1) {
1013 assembler->WriteCurrentPositionToRegister(reg, store_position);
1014 } else if (clear) {
1015 assembler->ClearRegisters(reg, reg);
1016 } else if (absolute) {
1017 assembler->SetRegister(reg, value);
1018 } else if (value != 0) {
1019 assembler->AdvanceRegister(reg, value);
1020 }
1021 }
1022}
1023
1024
1025// This is called as we come into a loop choice node and some other tricky
1026// nodes. It normalizes the state of the code generator to ensure we can
1027// generate generic code.
1028void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1029 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1030
1031 ASSERT(!is_trivial());
1032
1033 if (actions_ == NULL && backtrack() == NULL) {
1034 // Here we just have some deferred cp advances to fix and we are back to
1035 // a normal situation. We may also have to forget some information gained
1036 // through a quick check that was already performed.
1037 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1038 // Create a new trivial state and generate the node with that.
1039 Trace new_state;
1040 successor->Emit(compiler, &new_state);
1041 return;
1042 }
1043
1044 // Generate deferred actions here along with code to undo them again.
1045 OutSet affected_registers;
1046
1047 if (backtrack() != NULL) {
1048 // Here we have a concrete backtrack location. These are set up by choice
1049 // nodes and so they indicate that we have a deferred save of the current
1050 // position which we may need to emit here.
1051 assembler->PushCurrentPosition();
1052 }
1053
1054 int max_register = FindAffectedRegisters(&affected_registers);
1055 OutSet registers_to_pop;
1056 OutSet registers_to_clear;
1057 PerformDeferredActions(assembler,
1058 max_register,
1059 affected_registers,
1060 &registers_to_pop,
1061 &registers_to_clear);
1062 if (cp_offset_ != 0) {
1063 assembler->AdvanceCurrentPosition(cp_offset_);
1064 }
1065
1066 // Create a new trivial state and generate the node with that.
1067 Label undo;
1068 assembler->PushBacktrack(&undo);
1069 Trace new_state;
1070 successor->Emit(compiler, &new_state);
1071
1072 // On backtrack we need to restore state.
1073 assembler->Bind(&undo);
1074 RestoreAffectedRegisters(assembler,
1075 max_register,
1076 registers_to_pop,
1077 registers_to_clear);
1078 if (backtrack() == NULL) {
1079 assembler->Backtrack();
1080 } else {
1081 assembler->PopCurrentPosition();
1082 assembler->GoTo(backtrack());
1083 }
1084}
1085
1086
1087void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1088 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1089
1090 // Omit flushing the trace. We discard the entire stack frame anyway.
1091
1092 if (!label()->is_bound()) {
1093 // We are completely independent of the trace, since we ignore it,
1094 // so this code can be used as the generic version.
1095 assembler->Bind(label());
1096 }
1097
1098 // Throw away everything on the backtrack stack since the start
1099 // of the negative submatch and restore the character position.
1100 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1101 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1102 if (clear_capture_count_ > 0) {
1103 // Clear any captures that might have been performed during the success
1104 // of the body of the negative look-ahead.
1105 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1106 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1107 }
1108 // Now that we have unwound the stack we find at the top of the stack the
1109 // backtrack that the BeginSubmatch node got.
1110 assembler->Backtrack();
1111}
1112
1113
1114void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1115 if (!trace->is_trivial()) {
1116 trace->Flush(compiler, this);
1117 return;
1118 }
1119 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1120 if (!label()->is_bound()) {
1121 assembler->Bind(label());
1122 }
1123 switch (action_) {
1124 case ACCEPT:
1125 assembler->Succeed();
1126 return;
1127 case BACKTRACK:
1128 assembler->GoTo(trace->backtrack());
1129 return;
1130 case NEGATIVE_SUBMATCH_SUCCESS:
1131 // This case is handled in a different virtual method.
1132 UNREACHABLE();
1133 }
1134 UNIMPLEMENTED();
1135}
1136
1137
1138void GuardedAlternative::AddGuard(Guard* guard) {
1139 if (guards_ == NULL)
1140 guards_ = new ZoneList<Guard*>(1);
1141 guards_->Add(guard);
1142}
1143
1144
1145ActionNode* ActionNode::SetRegister(int reg,
1146 int val,
1147 RegExpNode* on_success) {
1148 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
1149 result->data_.u_store_register.reg = reg;
1150 result->data_.u_store_register.value = val;
1151 return result;
1152}
1153
1154
1155ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1156 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1157 result->data_.u_increment_register.reg = reg;
1158 return result;
1159}
1160
1161
1162ActionNode* ActionNode::StorePosition(int reg,
1163 bool is_capture,
1164 RegExpNode* on_success) {
1165 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1166 result->data_.u_position_register.reg = reg;
1167 result->data_.u_position_register.is_capture = is_capture;
1168 return result;
1169}
1170
1171
1172ActionNode* ActionNode::ClearCaptures(Interval range,
1173 RegExpNode* on_success) {
1174 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1175 result->data_.u_clear_captures.range_from = range.from();
1176 result->data_.u_clear_captures.range_to = range.to();
1177 return result;
1178}
1179
1180
1181ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1182 int position_reg,
1183 RegExpNode* on_success) {
1184 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1185 result->data_.u_submatch.stack_pointer_register = stack_reg;
1186 result->data_.u_submatch.current_position_register = position_reg;
1187 return result;
1188}
1189
1190
1191ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1192 int position_reg,
1193 int clear_register_count,
1194 int clear_register_from,
1195 RegExpNode* on_success) {
1196 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1197 result->data_.u_submatch.stack_pointer_register = stack_reg;
1198 result->data_.u_submatch.current_position_register = position_reg;
1199 result->data_.u_submatch.clear_register_count = clear_register_count;
1200 result->data_.u_submatch.clear_register_from = clear_register_from;
1201 return result;
1202}
1203
1204
1205ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1206 int repetition_register,
1207 int repetition_limit,
1208 RegExpNode* on_success) {
1209 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1210 result->data_.u_empty_match_check.start_register = start_register;
1211 result->data_.u_empty_match_check.repetition_register = repetition_register;
1212 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1213 return result;
1214}
1215
1216
1217#define DEFINE_ACCEPT(Type) \
1218 void Type##Node::Accept(NodeVisitor* visitor) { \
1219 visitor->Visit##Type(this); \
1220 }
1221FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1222#undef DEFINE_ACCEPT
1223
1224
1225void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1226 visitor->VisitLoopChoice(this);
1227}
1228
1229
1230// -------------------------------------------------------------------
1231// Emit code.
1232
1233
1234void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1235 Guard* guard,
1236 Trace* trace) {
1237 switch (guard->op()) {
1238 case Guard::LT:
1239 ASSERT(!trace->mentions_reg(guard->reg()));
1240 macro_assembler->IfRegisterGE(guard->reg(),
1241 guard->value(),
1242 trace->backtrack());
1243 break;
1244 case Guard::GEQ:
1245 ASSERT(!trace->mentions_reg(guard->reg()));
1246 macro_assembler->IfRegisterLT(guard->reg(),
1247 guard->value(),
1248 trace->backtrack());
1249 break;
1250 }
1251}
1252
1253
1254static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1255static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1256
1257
1258// Returns the number of characters in the equivalence class, omitting those
1259// that cannot occur in the source string because it is ASCII.
1260static int GetCaseIndependentLetters(uc16 character,
1261 bool ascii_subject,
1262 unibrow::uchar* letters) {
1263 int length = uncanonicalize.get(character, '\0', letters);
1264 // Unibrow returns 0 or 1 for characters where case independependence is
1265 // trivial.
1266 if (length == 0) {
1267 letters[0] = character;
1268 length = 1;
1269 }
1270 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1271 return length;
1272 }
1273 // The standard requires that non-ASCII characters cannot have ASCII
1274 // character codes in their equivalence class.
1275 return 0;
1276}
1277
1278
1279static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1280 uc16 c,
1281 Label* on_failure,
1282 int cp_offset,
1283 bool check,
1284 bool preloaded) {
1285 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1286 bool bound_checked = false;
1287 if (!preloaded) {
1288 assembler->LoadCurrentCharacter(
1289 cp_offset,
1290 on_failure,
1291 check);
1292 bound_checked = true;
1293 }
1294 assembler->CheckNotCharacter(c, on_failure);
1295 return bound_checked;
1296}
1297
1298
1299// Only emits non-letters (things that don't have case). Only used for case
1300// independent matches.
1301static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1302 uc16 c,
1303 Label* on_failure,
1304 int cp_offset,
1305 bool check,
1306 bool preloaded) {
1307 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1308 bool ascii = compiler->ascii();
1309 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1310 int length = GetCaseIndependentLetters(c, ascii, chars);
1311 if (length < 1) {
1312 // This can't match. Must be an ASCII subject and a non-ASCII character.
1313 // We do not need to do anything since the ASCII pass already handled this.
1314 return false; // Bounds not checked.
1315 }
1316 bool checked = false;
1317 // We handle the length > 1 case in a later pass.
1318 if (length == 1) {
1319 if (ascii && c > String::kMaxAsciiCharCodeU) {
1320 // Can't match - see above.
1321 return false; // Bounds not checked.
1322 }
1323 if (!preloaded) {
1324 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1325 checked = check;
1326 }
1327 macro_assembler->CheckNotCharacter(c, on_failure);
1328 }
1329 return checked;
1330}
1331
1332
1333static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1334 bool ascii,
1335 uc16 c1,
1336 uc16 c2,
1337 Label* on_failure) {
1338 uc16 char_mask;
1339 if (ascii) {
1340 char_mask = String::kMaxAsciiCharCode;
1341 } else {
1342 char_mask = String::kMaxUC16CharCode;
1343 }
1344 uc16 exor = c1 ^ c2;
1345 // Check whether exor has only one bit set.
1346 if (((exor - 1) & exor) == 0) {
1347 // If c1 and c2 differ only by one bit.
1348 // Ecma262UnCanonicalize always gives the highest number last.
1349 ASSERT(c2 > c1);
1350 uc16 mask = char_mask ^ exor;
1351 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1352 return true;
1353 }
1354 ASSERT(c2 > c1);
1355 uc16 diff = c2 - c1;
1356 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1357 // If the characters differ by 2^n but don't differ by one bit then
1358 // subtract the difference from the found character, then do the or
1359 // trick. We avoid the theoretical case where negative numbers are
1360 // involved in order to simplify code generation.
1361 uc16 mask = char_mask ^ diff;
1362 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1363 diff,
1364 mask,
1365 on_failure);
1366 return true;
1367 }
1368 return false;
1369}
1370
1371
1372typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1373 uc16 c,
1374 Label* on_failure,
1375 int cp_offset,
1376 bool check,
1377 bool preloaded);
1378
1379// Only emits letters (things that have case). Only used for case independent
1380// matches.
1381static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1382 uc16 c,
1383 Label* on_failure,
1384 int cp_offset,
1385 bool check,
1386 bool preloaded) {
1387 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1388 bool ascii = compiler->ascii();
1389 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1390 int length = GetCaseIndependentLetters(c, ascii, chars);
1391 if (length <= 1) return false;
1392 // We may not need to check against the end of the input string
1393 // if this character lies before a character that matched.
1394 if (!preloaded) {
1395 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1396 }
1397 Label ok;
1398 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1399 switch (length) {
1400 case 2: {
1401 if (ShortCutEmitCharacterPair(macro_assembler,
1402 ascii,
1403 chars[0],
1404 chars[1],
1405 on_failure)) {
1406 } else {
1407 macro_assembler->CheckCharacter(chars[0], &ok);
1408 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1409 macro_assembler->Bind(&ok);
1410 }
1411 break;
1412 }
1413 case 4:
1414 macro_assembler->CheckCharacter(chars[3], &ok);
1415 // Fall through!
1416 case 3:
1417 macro_assembler->CheckCharacter(chars[0], &ok);
1418 macro_assembler->CheckCharacter(chars[1], &ok);
1419 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1420 macro_assembler->Bind(&ok);
1421 break;
1422 default:
1423 UNREACHABLE();
1424 break;
1425 }
1426 return true;
1427}
1428
1429
1430static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1431 RegExpCharacterClass* cc,
1432 bool ascii,
1433 Label* on_failure,
1434 int cp_offset,
1435 bool check_offset,
1436 bool preloaded) {
1437 if (cc->is_standard() &&
1438 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1439 cp_offset,
1440 check_offset,
1441 on_failure)) {
1442 return;
1443 }
1444
1445 ZoneList<CharacterRange>* ranges = cc->ranges();
1446 int max_char;
1447 if (ascii) {
1448 max_char = String::kMaxAsciiCharCode;
1449 } else {
1450 max_char = String::kMaxUC16CharCode;
1451 }
1452
1453 Label success;
1454
1455 Label* char_is_in_class =
1456 cc->is_negated() ? on_failure : &success;
1457
1458 int range_count = ranges->length();
1459
1460 int last_valid_range = range_count - 1;
1461 while (last_valid_range >= 0) {
1462 CharacterRange& range = ranges->at(last_valid_range);
1463 if (range.from() <= max_char) {
1464 break;
1465 }
1466 last_valid_range--;
1467 }
1468
1469 if (last_valid_range < 0) {
1470 if (!cc->is_negated()) {
1471 // TODO(plesner): We can remove this when the node level does our
1472 // ASCII optimizations for us.
1473 macro_assembler->GoTo(on_failure);
1474 }
1475 if (check_offset) {
1476 macro_assembler->CheckPosition(cp_offset, on_failure);
1477 }
1478 return;
1479 }
1480
1481 if (last_valid_range == 0 &&
1482 !cc->is_negated() &&
1483 ranges->at(0).IsEverything(max_char)) {
1484 // This is a common case hit by non-anchored expressions.
1485 if (check_offset) {
1486 macro_assembler->CheckPosition(cp_offset, on_failure);
1487 }
1488 return;
1489 }
1490
1491 if (!preloaded) {
1492 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1493 }
1494
1495 for (int i = 0; i < last_valid_range; i++) {
1496 CharacterRange& range = ranges->at(i);
1497 Label next_range;
1498 uc16 from = range.from();
1499 uc16 to = range.to();
1500 if (from > max_char) {
1501 continue;
1502 }
1503 if (to > max_char) to = max_char;
1504 if (to == from) {
1505 macro_assembler->CheckCharacter(to, char_is_in_class);
1506 } else {
1507 if (from != 0) {
1508 macro_assembler->CheckCharacterLT(from, &next_range);
1509 }
1510 if (to != max_char) {
1511 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1512 } else {
1513 macro_assembler->GoTo(char_is_in_class);
1514 }
1515 }
1516 macro_assembler->Bind(&next_range);
1517 }
1518
1519 CharacterRange& range = ranges->at(last_valid_range);
1520 uc16 from = range.from();
1521 uc16 to = range.to();
1522
1523 if (to > max_char) to = max_char;
1524 ASSERT(to >= from);
1525
1526 if (to == from) {
1527 if (cc->is_negated()) {
1528 macro_assembler->CheckCharacter(to, on_failure);
1529 } else {
1530 macro_assembler->CheckNotCharacter(to, on_failure);
1531 }
1532 } else {
1533 if (from != 0) {
1534 if (cc->is_negated()) {
1535 macro_assembler->CheckCharacterLT(from, &success);
1536 } else {
1537 macro_assembler->CheckCharacterLT(from, on_failure);
1538 }
1539 }
1540 if (to != String::kMaxUC16CharCode) {
1541 if (cc->is_negated()) {
1542 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1543 } else {
1544 macro_assembler->CheckCharacterGT(to, on_failure);
1545 }
1546 } else {
1547 if (cc->is_negated()) {
1548 macro_assembler->GoTo(on_failure);
1549 }
1550 }
1551 }
1552 macro_assembler->Bind(&success);
1553}
1554
1555
1556RegExpNode::~RegExpNode() {
1557}
1558
1559
1560RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1561 Trace* trace) {
1562 // If we are generating a greedy loop then don't stop and don't reuse code.
1563 if (trace->stop_node() != NULL) {
1564 return CONTINUE;
1565 }
1566
1567 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1568 if (trace->is_trivial()) {
1569 if (label_.is_bound()) {
1570 // We are being asked to generate a generic version, but that's already
1571 // been done so just go to it.
1572 macro_assembler->GoTo(&label_);
1573 return DONE;
1574 }
1575 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1576 // To avoid too deep recursion we push the node to the work queue and just
1577 // generate a goto here.
1578 compiler->AddWork(this);
1579 macro_assembler->GoTo(&label_);
1580 return DONE;
1581 }
1582 // Generate generic version of the node and bind the label for later use.
1583 macro_assembler->Bind(&label_);
1584 return CONTINUE;
1585 }
1586
1587 // We are being asked to make a non-generic version. Keep track of how many
1588 // non-generic versions we generate so as not to overdo it.
1589 trace_count_++;
1590 if (FLAG_regexp_optimization &&
1591 trace_count_ < kMaxCopiesCodeGenerated &&
1592 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1593 return CONTINUE;
1594 }
1595
1596 // If we get here code has been generated for this node too many times or
1597 // recursion is too deep. Time to switch to a generic version. The code for
1598 // generic versions above can handle deep recursion properly.
1599 trace->Flush(compiler, this);
1600 return DONE;
1601}
1602
1603
1604int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1605 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1606 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1607 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1608}
1609
1610
1611int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1612 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1613 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1614}
1615
1616
1617int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1618 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1619 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1620}
1621
1622
1623int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1624 int answer = Length();
1625 if (answer >= still_to_find) return answer;
1626 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1627 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1628 recursion_depth + 1);
1629}
1630
1631
1632int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1633 int recursion_depth) {
1634 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1635 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1636 // afterwards.
1637 RegExpNode* node = alternatives_->at(1).node();
1638 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1639}
1640
1641
1642void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1643 QuickCheckDetails* details,
1644 RegExpCompiler* compiler,
1645 int filled_in,
1646 bool not_at_start) {
1647 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1648 // afterwards.
1649 RegExpNode* node = alternatives_->at(1).node();
1650 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1651}
1652
1653
1654int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1655 int recursion_depth,
1656 RegExpNode* ignore_this_node) {
1657 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1658 int min = 100;
1659 int choice_count = alternatives_->length();
1660 for (int i = 0; i < choice_count; i++) {
1661 RegExpNode* node = alternatives_->at(i).node();
1662 if (node == ignore_this_node) continue;
1663 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1664 recursion_depth + 1);
1665 if (node_eats_at_least < min) min = node_eats_at_least;
1666 }
1667 return min;
1668}
1669
1670
1671int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1672 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
1673}
1674
1675
1676int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1677 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
1678}
1679
1680
1681// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1682static inline uint32_t SmearBitsRight(uint32_t v) {
1683 v |= v >> 1;
1684 v |= v >> 2;
1685 v |= v >> 4;
1686 v |= v >> 8;
1687 v |= v >> 16;
1688 return v;
1689}
1690
1691
1692bool QuickCheckDetails::Rationalize(bool asc) {
1693 bool found_useful_op = false;
1694 uint32_t char_mask;
1695 if (asc) {
1696 char_mask = String::kMaxAsciiCharCode;
1697 } else {
1698 char_mask = String::kMaxUC16CharCode;
1699 }
1700 mask_ = 0;
1701 value_ = 0;
1702 int char_shift = 0;
1703 for (int i = 0; i < characters_; i++) {
1704 Position* pos = &positions_[i];
1705 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1706 found_useful_op = true;
1707 }
1708 mask_ |= (pos->mask & char_mask) << char_shift;
1709 value_ |= (pos->value & char_mask) << char_shift;
1710 char_shift += asc ? 8 : 16;
1711 }
1712 return found_useful_op;
1713}
1714
1715
1716bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1717 Trace* trace,
1718 bool preload_has_checked_bounds,
1719 Label* on_possible_success,
1720 QuickCheckDetails* details,
1721 bool fall_through_on_failure) {
1722 if (details->characters() == 0) return false;
1723 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1724 if (details->cannot_match()) return false;
1725 if (!details->Rationalize(compiler->ascii())) return false;
1726 ASSERT(details->characters() == 1 ||
1727 compiler->macro_assembler()->CanReadUnaligned());
1728 uint32_t mask = details->mask();
1729 uint32_t value = details->value();
1730
1731 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1732
1733 if (trace->characters_preloaded() != details->characters()) {
1734 assembler->LoadCurrentCharacter(trace->cp_offset(),
1735 trace->backtrack(),
1736 !preload_has_checked_bounds,
1737 details->characters());
1738 }
1739
1740
1741 bool need_mask = true;
1742
1743 if (details->characters() == 1) {
1744 // If number of characters preloaded is 1 then we used a byte or 16 bit
1745 // load so the value is already masked down.
1746 uint32_t char_mask;
1747 if (compiler->ascii()) {
1748 char_mask = String::kMaxAsciiCharCode;
1749 } else {
1750 char_mask = String::kMaxUC16CharCode;
1751 }
1752 if ((mask & char_mask) == char_mask) need_mask = false;
1753 mask &= char_mask;
1754 } else {
1755 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1756 // zero extend.
1757 if (details->characters() == 2 && compiler->ascii()) {
1758 if ((mask & 0xffff) == 0xffff) need_mask = false;
1759 } else {
1760 if (mask == 0xffffffff) need_mask = false;
1761 }
1762 }
1763
1764 if (fall_through_on_failure) {
1765 if (need_mask) {
1766 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1767 } else {
1768 assembler->CheckCharacter(value, on_possible_success);
1769 }
1770 } else {
1771 if (need_mask) {
1772 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1773 } else {
1774 assembler->CheckNotCharacter(value, trace->backtrack());
1775 }
1776 }
1777 return true;
1778}
1779
1780
1781// Here is the meat of GetQuickCheckDetails (see also the comment on the
1782// super-class in the .h file).
1783//
1784// We iterate along the text object, building up for each character a
1785// mask and value that can be used to test for a quick failure to match.
1786// The masks and values for the positions will be combined into a single
1787// machine word for the current character width in order to be used in
1788// generating a quick check.
1789void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1790 RegExpCompiler* compiler,
1791 int characters_filled_in,
1792 bool not_at_start) {
1793 ASSERT(characters_filled_in < details->characters());
1794 int characters = details->characters();
1795 int char_mask;
1796 int char_shift;
1797 if (compiler->ascii()) {
1798 char_mask = String::kMaxAsciiCharCode;
1799 char_shift = 8;
1800 } else {
1801 char_mask = String::kMaxUC16CharCode;
1802 char_shift = 16;
1803 }
1804 for (int k = 0; k < elms_->length(); k++) {
1805 TextElement elm = elms_->at(k);
1806 if (elm.type == TextElement::ATOM) {
1807 Vector<const uc16> quarks = elm.data.u_atom->data();
1808 for (int i = 0; i < characters && i < quarks.length(); i++) {
1809 QuickCheckDetails::Position* pos =
1810 details->positions(characters_filled_in);
1811 uc16 c = quarks[i];
1812 if (c > char_mask) {
1813 // If we expect a non-ASCII character from an ASCII string,
1814 // there is no way we can match. Not even case independent
1815 // matching can turn an ASCII character into non-ASCII or
1816 // vice versa.
1817 details->set_cannot_match();
1818 pos->determines_perfectly = false;
1819 return;
1820 }
1821 if (compiler->ignore_case()) {
1822 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1823 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1824 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1825 if (length == 1) {
1826 // This letter has no case equivalents, so it's nice and simple
1827 // and the mask-compare will determine definitely whether we have
1828 // a match at this character position.
1829 pos->mask = char_mask;
1830 pos->value = c;
1831 pos->determines_perfectly = true;
1832 } else {
1833 uint32_t common_bits = char_mask;
1834 uint32_t bits = chars[0];
1835 for (int j = 1; j < length; j++) {
1836 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1837 common_bits ^= differing_bits;
1838 bits &= common_bits;
1839 }
1840 // If length is 2 and common bits has only one zero in it then
1841 // our mask and compare instruction will determine definitely
1842 // whether we have a match at this character position. Otherwise
1843 // it can only be an approximate check.
1844 uint32_t one_zero = (common_bits | ~char_mask);
1845 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1846 pos->determines_perfectly = true;
1847 }
1848 pos->mask = common_bits;
1849 pos->value = bits;
1850 }
1851 } else {
1852 // Don't ignore case. Nice simple case where the mask-compare will
1853 // determine definitely whether we have a match at this character
1854 // position.
1855 pos->mask = char_mask;
1856 pos->value = c;
1857 pos->determines_perfectly = true;
1858 }
1859 characters_filled_in++;
1860 ASSERT(characters_filled_in <= details->characters());
1861 if (characters_filled_in == details->characters()) {
1862 return;
1863 }
1864 }
1865 } else {
1866 QuickCheckDetails::Position* pos =
1867 details->positions(characters_filled_in);
1868 RegExpCharacterClass* tree = elm.data.u_char_class;
1869 ZoneList<CharacterRange>* ranges = tree->ranges();
1870 if (tree->is_negated()) {
1871 // A quick check uses multi-character mask and compare. There is no
1872 // useful way to incorporate a negative char class into this scheme
1873 // so we just conservatively create a mask and value that will always
1874 // succeed.
1875 pos->mask = 0;
1876 pos->value = 0;
1877 } else {
1878 int first_range = 0;
1879 while (ranges->at(first_range).from() > char_mask) {
1880 first_range++;
1881 if (first_range == ranges->length()) {
1882 details->set_cannot_match();
1883 pos->determines_perfectly = false;
1884 return;
1885 }
1886 }
1887 CharacterRange range = ranges->at(first_range);
1888 uc16 from = range.from();
1889 uc16 to = range.to();
1890 if (to > char_mask) {
1891 to = char_mask;
1892 }
1893 uint32_t differing_bits = (from ^ to);
1894 // A mask and compare is only perfect if the differing bits form a
1895 // number like 00011111 with one single block of trailing 1s.
1896 if ((differing_bits & (differing_bits + 1)) == 0 &&
1897 from + differing_bits == to) {
1898 pos->determines_perfectly = true;
1899 }
1900 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1901 uint32_t bits = (from & common_bits);
1902 for (int i = first_range + 1; i < ranges->length(); i++) {
1903 CharacterRange range = ranges->at(i);
1904 uc16 from = range.from();
1905 uc16 to = range.to();
1906 if (from > char_mask) continue;
1907 if (to > char_mask) to = char_mask;
1908 // Here we are combining more ranges into the mask and compare
1909 // value. With each new range the mask becomes more sparse and
1910 // so the chances of a false positive rise. A character class
1911 // with multiple ranges is assumed never to be equivalent to a
1912 // mask and compare operation.
1913 pos->determines_perfectly = false;
1914 uint32_t new_common_bits = (from ^ to);
1915 new_common_bits = ~SmearBitsRight(new_common_bits);
1916 common_bits &= new_common_bits;
1917 bits &= new_common_bits;
1918 uint32_t differing_bits = (from & common_bits) ^ bits;
1919 common_bits ^= differing_bits;
1920 bits &= common_bits;
1921 }
1922 pos->mask = common_bits;
1923 pos->value = bits;
1924 }
1925 characters_filled_in++;
1926 ASSERT(characters_filled_in <= details->characters());
1927 if (characters_filled_in == details->characters()) {
1928 return;
1929 }
1930 }
1931 }
1932 ASSERT(characters_filled_in != details->characters());
1933 on_success()-> GetQuickCheckDetails(details,
1934 compiler,
1935 characters_filled_in,
1936 true);
1937}
1938
1939
1940void QuickCheckDetails::Clear() {
1941 for (int i = 0; i < characters_; i++) {
1942 positions_[i].mask = 0;
1943 positions_[i].value = 0;
1944 positions_[i].determines_perfectly = false;
1945 }
1946 characters_ = 0;
1947}
1948
1949
1950void QuickCheckDetails::Advance(int by, bool ascii) {
1951 ASSERT(by >= 0);
1952 if (by >= characters_) {
1953 Clear();
1954 return;
1955 }
1956 for (int i = 0; i < characters_ - by; i++) {
1957 positions_[i] = positions_[by + i];
1958 }
1959 for (int i = characters_ - by; i < characters_; i++) {
1960 positions_[i].mask = 0;
1961 positions_[i].value = 0;
1962 positions_[i].determines_perfectly = false;
1963 }
1964 characters_ -= by;
1965 // We could change mask_ and value_ here but we would never advance unless
1966 // they had already been used in a check and they won't be used again because
1967 // it would gain us nothing. So there's no point.
1968}
1969
1970
1971void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1972 ASSERT(characters_ == other->characters_);
1973 if (other->cannot_match_) {
1974 return;
1975 }
1976 if (cannot_match_) {
1977 *this = *other;
1978 return;
1979 }
1980 for (int i = from_index; i < characters_; i++) {
1981 QuickCheckDetails::Position* pos = positions(i);
1982 QuickCheckDetails::Position* other_pos = other->positions(i);
1983 if (pos->mask != other_pos->mask ||
1984 pos->value != other_pos->value ||
1985 !other_pos->determines_perfectly) {
1986 // Our mask-compare operation will be approximate unless we have the
1987 // exact same operation on both sides of the alternation.
1988 pos->determines_perfectly = false;
1989 }
1990 pos->mask &= other_pos->mask;
1991 pos->value &= pos->mask;
1992 other_pos->value &= pos->mask;
1993 uc16 differing_bits = (pos->value ^ other_pos->value);
1994 pos->mask &= ~differing_bits;
1995 pos->value &= pos->mask;
1996 }
1997}
1998
1999
2000class VisitMarker {
2001 public:
2002 explicit VisitMarker(NodeInfo* info) : info_(info) {
2003 ASSERT(!info->visited);
2004 info->visited = true;
2005 }
2006 ~VisitMarker() {
2007 info_->visited = false;
2008 }
2009 private:
2010 NodeInfo* info_;
2011};
2012
2013
2014void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2015 RegExpCompiler* compiler,
2016 int characters_filled_in,
2017 bool not_at_start) {
2018 if (body_can_be_zero_length_ || info()->visited) return;
2019 VisitMarker marker(info());
2020 return ChoiceNode::GetQuickCheckDetails(details,
2021 compiler,
2022 characters_filled_in,
2023 not_at_start);
2024}
2025
2026
2027void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2028 RegExpCompiler* compiler,
2029 int characters_filled_in,
2030 bool not_at_start) {
2031 not_at_start = (not_at_start || not_at_start_);
2032 int choice_count = alternatives_->length();
2033 ASSERT(choice_count > 0);
2034 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2035 compiler,
2036 characters_filled_in,
2037 not_at_start);
2038 for (int i = 1; i < choice_count; i++) {
2039 QuickCheckDetails new_details(details->characters());
2040 RegExpNode* node = alternatives_->at(i).node();
2041 node->GetQuickCheckDetails(&new_details, compiler,
2042 characters_filled_in,
2043 not_at_start);
2044 // Here we merge the quick match details of the two branches.
2045 details->Merge(&new_details, characters_filled_in);
2046 }
2047}
2048
2049
2050// Check for [0-9A-Z_a-z].
2051static void EmitWordCheck(RegExpMacroAssembler* assembler,
2052 Label* word,
2053 Label* non_word,
2054 bool fall_through_on_word) {
2055 assembler->CheckCharacterGT('z', non_word);
2056 assembler->CheckCharacterLT('0', non_word);
2057 assembler->CheckCharacterGT('a' - 1, word);
2058 assembler->CheckCharacterLT('9' + 1, word);
2059 assembler->CheckCharacterLT('A', non_word);
2060 assembler->CheckCharacterLT('Z' + 1, word);
2061 if (fall_through_on_word) {
2062 assembler->CheckNotCharacter('_', non_word);
2063 } else {
2064 assembler->CheckCharacter('_', word);
2065 }
2066}
2067
2068
2069// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2070// that matches newline or the start of input).
2071static void EmitHat(RegExpCompiler* compiler,
2072 RegExpNode* on_success,
2073 Trace* trace) {
2074 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2075 // We will be loading the previous character into the current character
2076 // register.
2077 Trace new_trace(*trace);
2078 new_trace.InvalidateCurrentCharacter();
2079
2080 Label ok;
2081 if (new_trace.cp_offset() == 0) {
2082 // The start of input counts as a newline in this context, so skip to
2083 // ok if we are at the start.
2084 assembler->CheckAtStart(&ok);
2085 }
2086 // We already checked that we are not at the start of input so it must be
2087 // OK to load the previous character.
2088 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2089 new_trace.backtrack(),
2090 false);
2091 // Newline means \n, \r, 0x2028 or 0x2029.
2092 if (!compiler->ascii()) {
2093 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2094 }
2095 assembler->CheckCharacter('\n', &ok);
2096 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2097 assembler->Bind(&ok);
2098 on_success->Emit(compiler, &new_trace);
2099}
2100
2101
2102// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2103static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2104 RegExpCompiler* compiler,
2105 RegExpNode* on_success,
2106 Trace* trace) {
2107 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2108 Label before_non_word;
2109 Label before_word;
2110 if (trace->characters_preloaded() != 1) {
2111 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2112 }
2113 // Fall through on non-word.
2114 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2115
2116 // We will be loading the previous character into the current character
2117 // register.
2118 Trace new_trace(*trace);
2119 new_trace.InvalidateCurrentCharacter();
2120
2121 Label ok;
2122 Label* boundary;
2123 Label* not_boundary;
2124 if (type == AssertionNode::AT_BOUNDARY) {
2125 boundary = &ok;
2126 not_boundary = new_trace.backtrack();
2127 } else {
2128 not_boundary = &ok;
2129 boundary = new_trace.backtrack();
2130 }
2131
2132 // Next character is not a word character.
2133 assembler->Bind(&before_non_word);
2134 if (new_trace.cp_offset() == 0) {
2135 // The start of input counts as a non-word character, so the question is
2136 // decided if we are at the start.
2137 assembler->CheckAtStart(not_boundary);
2138 }
2139 // We already checked that we are not at the start of input so it must be
2140 // OK to load the previous character.
2141 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2142 &ok, // Unused dummy label in this call.
2143 false);
2144 // Fall through on non-word.
2145 EmitWordCheck(assembler, boundary, not_boundary, false);
2146 assembler->GoTo(not_boundary);
2147
2148 // Next character is a word character.
2149 assembler->Bind(&before_word);
2150 if (new_trace.cp_offset() == 0) {
2151 // The start of input counts as a non-word character, so the question is
2152 // decided if we are at the start.
2153 assembler->CheckAtStart(boundary);
2154 }
2155 // We already checked that we are not at the start of input so it must be
2156 // OK to load the previous character.
2157 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2158 &ok, // Unused dummy label in this call.
2159 false);
2160 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2161 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2162
2163 assembler->Bind(&ok);
2164
2165 on_success->Emit(compiler, &new_trace);
2166}
2167
2168
2169void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2170 RegExpCompiler* compiler,
2171 int filled_in,
2172 bool not_at_start) {
2173 if (type_ == AT_START && not_at_start) {
2174 details->set_cannot_match();
2175 return;
2176 }
2177 return on_success()->GetQuickCheckDetails(details,
2178 compiler,
2179 filled_in,
2180 not_at_start);
2181}
2182
2183
2184void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2185 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2186 switch (type_) {
2187 case AT_END: {
2188 Label ok;
2189 assembler->CheckPosition(trace->cp_offset(), &ok);
2190 assembler->GoTo(trace->backtrack());
2191 assembler->Bind(&ok);
2192 break;
2193 }
2194 case AT_START: {
2195 if (trace->at_start() == Trace::FALSE) {
2196 assembler->GoTo(trace->backtrack());
2197 return;
2198 }
2199 if (trace->at_start() == Trace::UNKNOWN) {
2200 assembler->CheckNotAtStart(trace->backtrack());
2201 Trace at_start_trace = *trace;
2202 at_start_trace.set_at_start(true);
2203 on_success()->Emit(compiler, &at_start_trace);
2204 return;
2205 }
2206 }
2207 break;
2208 case AFTER_NEWLINE:
2209 EmitHat(compiler, on_success(), trace);
2210 return;
2211 case AT_NON_BOUNDARY:
2212 case AT_BOUNDARY:
2213 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2214 return;
2215 }
2216 on_success()->Emit(compiler, trace);
2217}
2218
2219
2220static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2221 if (quick_check == NULL) return false;
2222 if (offset >= quick_check->characters()) return false;
2223 return quick_check->positions(offset)->determines_perfectly;
2224}
2225
2226
2227static void UpdateBoundsCheck(int index, int* checked_up_to) {
2228 if (index > *checked_up_to) {
2229 *checked_up_to = index;
2230 }
2231}
2232
2233
2234// We call this repeatedly to generate code for each pass over the text node.
2235// The passes are in increasing order of difficulty because we hope one
2236// of the first passes will fail in which case we are saved the work of the
2237// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2238// we will check the '%' in the first pass, the case independent 'a' in the
2239// second pass and the character class in the last pass.
2240//
2241// The passes are done from right to left, so for example to test for /bar/
2242// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2243// and then a 'b' with offset 0. This means we can avoid the end-of-input
2244// bounds check most of the time. In the example we only need to check for
2245// end-of-input when loading the putative 'r'.
2246//
2247// A slight complication involves the fact that the first character may already
2248// be fetched into a register by the previous node. In this case we want to
2249// do the test for that character first. We do this in separate passes. The
2250// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2251// pass has been performed then subsequent passes will have true in
2252// first_element_checked to indicate that that character does not need to be
2253// checked again.
2254//
2255// In addition to all this we are passed a Trace, which can
2256// contain an AlternativeGeneration object. In this AlternativeGeneration
2257// object we can see details of any quick check that was already passed in
2258// order to get to the code we are now generating. The quick check can involve
2259// loading characters, which means we do not need to recheck the bounds
2260// up to the limit the quick check already checked. In addition the quick
2261// check can have involved a mask and compare operation which may simplify
2262// or obviate the need for further checks at some character positions.
2263void TextNode::TextEmitPass(RegExpCompiler* compiler,
2264 TextEmitPassType pass,
2265 bool preloaded,
2266 Trace* trace,
2267 bool first_element_checked,
2268 int* checked_up_to) {
2269 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2270 bool ascii = compiler->ascii();
2271 Label* backtrack = trace->backtrack();
2272 QuickCheckDetails* quick_check = trace->quick_check_performed();
2273 int element_count = elms_->length();
2274 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2275 TextElement elm = elms_->at(i);
2276 int cp_offset = trace->cp_offset() + elm.cp_offset;
2277 if (elm.type == TextElement::ATOM) {
2278 Vector<const uc16> quarks = elm.data.u_atom->data();
2279 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2280 if (first_element_checked && i == 0 && j == 0) continue;
2281 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2282 EmitCharacterFunction* emit_function = NULL;
2283 switch (pass) {
2284 case NON_ASCII_MATCH:
2285 ASSERT(ascii);
2286 if (quarks[j] > String::kMaxAsciiCharCode) {
2287 assembler->GoTo(backtrack);
2288 return;
2289 }
2290 break;
2291 case NON_LETTER_CHARACTER_MATCH:
2292 emit_function = &EmitAtomNonLetter;
2293 break;
2294 case SIMPLE_CHARACTER_MATCH:
2295 emit_function = &EmitSimpleCharacter;
2296 break;
2297 case CASE_CHARACTER_MATCH:
2298 emit_function = &EmitAtomLetter;
2299 break;
2300 default:
2301 break;
2302 }
2303 if (emit_function != NULL) {
2304 bool bound_checked = emit_function(compiler,
2305 quarks[j],
2306 backtrack,
2307 cp_offset + j,
2308 *checked_up_to < cp_offset + j,
2309 preloaded);
2310 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2311 }
2312 }
2313 } else {
2314 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2315 if (pass == CHARACTER_CLASS_MATCH) {
2316 if (first_element_checked && i == 0) continue;
2317 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2318 RegExpCharacterClass* cc = elm.data.u_char_class;
2319 EmitCharClass(assembler,
2320 cc,
2321 ascii,
2322 backtrack,
2323 cp_offset,
2324 *checked_up_to < cp_offset,
2325 preloaded);
2326 UpdateBoundsCheck(cp_offset, checked_up_to);
2327 }
2328 }
2329 }
2330}
2331
2332
2333int TextNode::Length() {
2334 TextElement elm = elms_->last();
2335 ASSERT(elm.cp_offset >= 0);
2336 if (elm.type == TextElement::ATOM) {
2337 return elm.cp_offset + elm.data.u_atom->data().length();
2338 } else {
2339 return elm.cp_offset + 1;
2340 }
2341}
2342
2343
2344bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2345 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2346 if (ignore_case) {
2347 return pass == SIMPLE_CHARACTER_MATCH;
2348 } else {
2349 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2350 }
2351}
2352
2353
2354// This generates the code to match a text node. A text node can contain
2355// straight character sequences (possibly to be matched in a case-independent
2356// way) and character classes. For efficiency we do not do this in a single
2357// pass from left to right. Instead we pass over the text node several times,
2358// emitting code for some character positions every time. See the comment on
2359// TextEmitPass for details.
2360void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2361 LimitResult limit_result = LimitVersions(compiler, trace);
2362 if (limit_result == DONE) return;
2363 ASSERT(limit_result == CONTINUE);
2364
2365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2366 compiler->SetRegExpTooBig();
2367 return;
2368 }
2369
2370 if (compiler->ascii()) {
2371 int dummy = 0;
2372 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2373 }
2374
2375 bool first_elt_done = false;
2376 int bound_checked_to = trace->cp_offset() - 1;
2377 bound_checked_to += trace->bound_checked_up_to();
2378
2379 // If a character is preloaded into the current character register then
2380 // check that now.
2381 if (trace->characters_preloaded() == 1) {
2382 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2383 if (!SkipPass(pass, compiler->ignore_case())) {
2384 TextEmitPass(compiler,
2385 static_cast<TextEmitPassType>(pass),
2386 true,
2387 trace,
2388 false,
2389 &bound_checked_to);
2390 }
2391 }
2392 first_elt_done = true;
2393 }
2394
2395 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2396 if (!SkipPass(pass, compiler->ignore_case())) {
2397 TextEmitPass(compiler,
2398 static_cast<TextEmitPassType>(pass),
2399 false,
2400 trace,
2401 first_elt_done,
2402 &bound_checked_to);
2403 }
2404 }
2405
2406 Trace successor_trace(*trace);
2407 successor_trace.set_at_start(false);
2408 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2409 RecursionCheck rc(compiler);
2410 on_success()->Emit(compiler, &successor_trace);
2411}
2412
2413
2414void Trace::InvalidateCurrentCharacter() {
2415 characters_preloaded_ = 0;
2416}
2417
2418
2419void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2420 ASSERT(by > 0);
2421 // We don't have an instruction for shifting the current character register
2422 // down or for using a shifted value for anything so lets just forget that
2423 // we preloaded any characters into it.
2424 characters_preloaded_ = 0;
2425 // Adjust the offsets of the quick check performed information. This
2426 // information is used to find out what we already determined about the
2427 // characters by means of mask and compare.
2428 quick_check_performed_.Advance(by, compiler->ascii());
2429 cp_offset_ += by;
2430 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2431 compiler->SetRegExpTooBig();
2432 cp_offset_ = 0;
2433 }
2434 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2435}
2436
2437
2438void TextNode::MakeCaseIndependent() {
2439 int element_count = elms_->length();
2440 for (int i = 0; i < element_count; i++) {
2441 TextElement elm = elms_->at(i);
2442 if (elm.type == TextElement::CHAR_CLASS) {
2443 RegExpCharacterClass* cc = elm.data.u_char_class;
2444 ZoneList<CharacterRange>* ranges = cc->ranges();
2445 int range_count = ranges->length();
2446 for (int i = 0; i < range_count; i++) {
2447 ranges->at(i).AddCaseEquivalents(ranges);
2448 }
2449 }
2450 }
2451}
2452
2453
2454int TextNode::GreedyLoopTextLength() {
2455 TextElement elm = elms_->at(elms_->length() - 1);
2456 if (elm.type == TextElement::CHAR_CLASS) {
2457 return elm.cp_offset + 1;
2458 } else {
2459 return elm.cp_offset + elm.data.u_atom->data().length();
2460 }
2461}
2462
2463
2464// Finds the fixed match length of a sequence of nodes that goes from
2465// this alternative and back to this choice node. If there are variable
2466// length nodes or other complications in the way then return a sentinel
2467// value indicating that a greedy loop cannot be constructed.
2468int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2469 int length = 0;
2470 RegExpNode* node = alternative->node();
2471 // Later we will generate code for all these text nodes using recursion
2472 // so we have to limit the max number.
2473 int recursion_depth = 0;
2474 while (node != this) {
2475 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2476 return kNodeIsTooComplexForGreedyLoops;
2477 }
2478 int node_length = node->GreedyLoopTextLength();
2479 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2480 return kNodeIsTooComplexForGreedyLoops;
2481 }
2482 length += node_length;
2483 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2484 node = seq_node->on_success();
2485 }
2486 return length;
2487}
2488
2489
2490void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2491 ASSERT_EQ(loop_node_, NULL);
2492 AddAlternative(alt);
2493 loop_node_ = alt.node();
2494}
2495
2496
2497void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2498 ASSERT_EQ(continue_node_, NULL);
2499 AddAlternative(alt);
2500 continue_node_ = alt.node();
2501}
2502
2503
2504void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2505 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2506 if (trace->stop_node() == this) {
2507 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2508 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2509 // Update the counter-based backtracking info on the stack. This is an
2510 // optimization for greedy loops (see below).
2511 ASSERT(trace->cp_offset() == text_length);
2512 macro_assembler->AdvanceCurrentPosition(text_length);
2513 macro_assembler->GoTo(trace->loop_label());
2514 return;
2515 }
2516 ASSERT(trace->stop_node() == NULL);
2517 if (!trace->is_trivial()) {
2518 trace->Flush(compiler, this);
2519 return;
2520 }
2521 ChoiceNode::Emit(compiler, trace);
2522}
2523
2524
2525int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
2526 int preload_characters = EatsAtLeast(4, 0);
2527 if (compiler->macro_assembler()->CanReadUnaligned()) {
2528 bool ascii = compiler->ascii();
2529 if (ascii) {
2530 if (preload_characters > 4) preload_characters = 4;
2531 // We can't preload 3 characters because there is no machine instruction
2532 // to do that. We can't just load 4 because we could be reading
2533 // beyond the end of the string, which could cause a memory fault.
2534 if (preload_characters == 3) preload_characters = 2;
2535 } else {
2536 if (preload_characters > 2) preload_characters = 2;
2537 }
2538 } else {
2539 if (preload_characters > 1) preload_characters = 1;
2540 }
2541 return preload_characters;
2542}
2543
2544
2545// This class is used when generating the alternatives in a choice node. It
2546// records the way the alternative is being code generated.
2547class AlternativeGeneration: public Malloced {
2548 public:
2549 AlternativeGeneration()
2550 : possible_success(),
2551 expects_preload(false),
2552 after(),
2553 quick_check_details() { }
2554 Label possible_success;
2555 bool expects_preload;
2556 Label after;
2557 QuickCheckDetails quick_check_details;
2558};
2559
2560
2561// Creates a list of AlternativeGenerations. If the list has a reasonable
2562// size then it is on the stack, otherwise the excess is on the heap.
2563class AlternativeGenerationList {
2564 public:
2565 explicit AlternativeGenerationList(int count)
2566 : alt_gens_(count) {
2567 for (int i = 0; i < count && i < kAFew; i++) {
2568 alt_gens_.Add(a_few_alt_gens_ + i);
2569 }
2570 for (int i = kAFew; i < count; i++) {
2571 alt_gens_.Add(new AlternativeGeneration());
2572 }
2573 }
2574 ~AlternativeGenerationList() {
2575 for (int i = kAFew; i < alt_gens_.length(); i++) {
2576 delete alt_gens_[i];
2577 alt_gens_[i] = NULL;
2578 }
2579 }
2580
2581 AlternativeGeneration* at(int i) {
2582 return alt_gens_[i];
2583 }
2584 private:
2585 static const int kAFew = 10;
2586 ZoneList<AlternativeGeneration*> alt_gens_;
2587 AlternativeGeneration a_few_alt_gens_[kAFew];
2588};
2589
2590
2591/* Code generation for choice nodes.
2592 *
2593 * We generate quick checks that do a mask and compare to eliminate a
2594 * choice. If the quick check succeeds then it jumps to the continuation to
2595 * do slow checks and check subsequent nodes. If it fails (the common case)
2596 * it falls through to the next choice.
2597 *
2598 * Here is the desired flow graph. Nodes directly below each other imply
2599 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2600 * 3 doesn't have a quick check so we have to call the slow check.
2601 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2602 * regexp continuation is generated directly after the Sn node, up to the
2603 * next GoTo if we decide to reuse some already generated code. Some
2604 * nodes expect preload_characters to be preloaded into the current
2605 * character register. R nodes do this preloading. Vertices are marked
2606 * F for failures and S for success (possible success in the case of quick
2607 * nodes). L, V, < and > are used as arrow heads.
2608 *
2609 * ----------> R
2610 * |
2611 * V
2612 * Q1 -----> S1
2613 * | S /
2614 * F| /
2615 * | F/
2616 * | /
2617 * | R
2618 * | /
2619 * V L
2620 * Q2 -----> S2
2621 * | S /
2622 * F| /
2623 * | F/
2624 * | /
2625 * | R
2626 * | /
2627 * V L
2628 * S3
2629 * |
2630 * F|
2631 * |
2632 * R
2633 * |
2634 * backtrack V
2635 * <----------Q4
2636 * \ F |
2637 * \ |S
2638 * \ F V
2639 * \-----S4
2640 *
2641 * For greedy loops we reverse our expectation and expect to match rather
2642 * than fail. Therefore we want the loop code to look like this (U is the
2643 * unwind code that steps back in the greedy loop). The following alternatives
2644 * look the same as above.
2645 * _____
2646 * / \
2647 * V |
2648 * ----------> S1 |
2649 * /| |
2650 * / |S |
2651 * F/ \_____/
2652 * /
2653 * |<-----------
2654 * | \
2655 * V \
2656 * Q2 ---> S2 \
2657 * | S / |
2658 * F| / |
2659 * | F/ |
2660 * | / |
2661 * | R |
2662 * | / |
2663 * F VL |
2664 * <------U |
2665 * back |S |
2666 * \______________/
2667 */
2668
2669
2670void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2671 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2672 int choice_count = alternatives_->length();
2673#ifdef DEBUG
2674 for (int i = 0; i < choice_count - 1; i++) {
2675 GuardedAlternative alternative = alternatives_->at(i);
2676 ZoneList<Guard*>* guards = alternative.guards();
2677 int guard_count = (guards == NULL) ? 0 : guards->length();
2678 for (int j = 0; j < guard_count; j++) {
2679 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
2680 }
2681 }
2682#endif
2683
2684 LimitResult limit_result = LimitVersions(compiler, trace);
2685 if (limit_result == DONE) return;
2686 ASSERT(limit_result == CONTINUE);
2687
2688 int new_flush_budget = trace->flush_budget() / choice_count;
2689 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2690 trace->Flush(compiler, this);
2691 return;
2692 }
2693
2694 RecursionCheck rc(compiler);
2695
2696 Trace* current_trace = trace;
2697
2698 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2699 bool greedy_loop = false;
2700 Label greedy_loop_label;
2701 Trace counter_backtrack_trace;
2702 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
2703 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2704
2705 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2706 // Here we have special handling for greedy loops containing only text nodes
2707 // and other simple nodes. These are handled by pushing the current
2708 // position on the stack and then incrementing the current position each
2709 // time around the switch. On backtrack we decrement the current position
2710 // and check it against the pushed value. This avoids pushing backtrack
2711 // information for each iteration of the loop, which could take up a lot of
2712 // space.
2713 greedy_loop = true;
2714 ASSERT(trace->stop_node() == NULL);
2715 macro_assembler->PushCurrentPosition();
2716 current_trace = &counter_backtrack_trace;
2717 Label greedy_match_failed;
2718 Trace greedy_match_trace;
2719 if (not_at_start()) greedy_match_trace.set_at_start(false);
2720 greedy_match_trace.set_backtrack(&greedy_match_failed);
2721 Label loop_label;
2722 macro_assembler->Bind(&loop_label);
2723 greedy_match_trace.set_stop_node(this);
2724 greedy_match_trace.set_loop_label(&loop_label);
2725 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
2726 macro_assembler->Bind(&greedy_match_failed);
2727 }
2728
2729 Label second_choice; // For use in greedy matches.
2730 macro_assembler->Bind(&second_choice);
2731
2732 int first_normal_choice = greedy_loop ? 1 : 0;
2733
2734 int preload_characters = CalculatePreloadCharacters(compiler);
2735 bool preload_is_current =
2736 (current_trace->characters_preloaded() == preload_characters);
2737 bool preload_has_checked_bounds = preload_is_current;
2738
2739 AlternativeGenerationList alt_gens(choice_count);
2740
2741 // For now we just call all choices one after the other. The idea ultimately
2742 // is to use the Dispatch table to try only the relevant ones.
2743 for (int i = first_normal_choice; i < choice_count; i++) {
2744 GuardedAlternative alternative = alternatives_->at(i);
2745 AlternativeGeneration* alt_gen = alt_gens.at(i);
2746 alt_gen->quick_check_details.set_characters(preload_characters);
2747 ZoneList<Guard*>* guards = alternative.guards();
2748 int guard_count = (guards == NULL) ? 0 : guards->length();
2749 Trace new_trace(*current_trace);
2750 new_trace.set_characters_preloaded(preload_is_current ?
2751 preload_characters :
2752 0);
2753 if (preload_has_checked_bounds) {
2754 new_trace.set_bound_checked_up_to(preload_characters);
2755 }
2756 new_trace.quick_check_performed()->Clear();
2757 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
2758 alt_gen->expects_preload = preload_is_current;
2759 bool generate_full_check_inline = false;
2760 if (FLAG_regexp_optimization &&
2761 try_to_emit_quick_check_for_alternative(i) &&
2762 alternative.node()->EmitQuickCheck(compiler,
2763 &new_trace,
2764 preload_has_checked_bounds,
2765 &alt_gen->possible_success,
2766 &alt_gen->quick_check_details,
2767 i < choice_count - 1)) {
2768 // Quick check was generated for this choice.
2769 preload_is_current = true;
2770 preload_has_checked_bounds = true;
2771 // On the last choice in the ChoiceNode we generated the quick
2772 // check to fall through on possible success. So now we need to
2773 // generate the full check inline.
2774 if (i == choice_count - 1) {
2775 macro_assembler->Bind(&alt_gen->possible_success);
2776 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2777 new_trace.set_characters_preloaded(preload_characters);
2778 new_trace.set_bound_checked_up_to(preload_characters);
2779 generate_full_check_inline = true;
2780 }
2781 } else if (alt_gen->quick_check_details.cannot_match()) {
2782 if (i == choice_count - 1 && !greedy_loop) {
2783 macro_assembler->GoTo(trace->backtrack());
2784 }
2785 continue;
2786 } else {
2787 // No quick check was generated. Put the full code here.
2788 // If this is not the first choice then there could be slow checks from
2789 // previous cases that go here when they fail. There's no reason to
2790 // insist that they preload characters since the slow check we are about
2791 // to generate probably can't use it.
2792 if (i != first_normal_choice) {
2793 alt_gen->expects_preload = false;
2794 new_trace.set_characters_preloaded(0);
2795 }
2796 if (i < choice_count - 1) {
2797 new_trace.set_backtrack(&alt_gen->after);
2798 }
2799 generate_full_check_inline = true;
2800 }
2801 if (generate_full_check_inline) {
2802 if (new_trace.actions() != NULL) {
2803 new_trace.set_flush_budget(new_flush_budget);
2804 }
2805 for (int j = 0; j < guard_count; j++) {
2806 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
2807 }
2808 alternative.node()->Emit(compiler, &new_trace);
2809 preload_is_current = false;
2810 }
2811 macro_assembler->Bind(&alt_gen->after);
2812 }
2813 if (greedy_loop) {
2814 macro_assembler->Bind(&greedy_loop_label);
2815 // If we have unwound to the bottom then backtrack.
2816 macro_assembler->CheckGreedyLoop(trace->backtrack());
2817 // Otherwise try the second priority at an earlier position.
2818 macro_assembler->AdvanceCurrentPosition(-text_length);
2819 macro_assembler->GoTo(&second_choice);
2820 }
2821
2822 // At this point we need to generate slow checks for the alternatives where
2823 // the quick check was inlined. We can recognize these because the associated
2824 // label was bound.
2825 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2826 AlternativeGeneration* alt_gen = alt_gens.at(i);
2827 Trace new_trace(*current_trace);
2828 // If there are actions to be flushed we have to limit how many times
2829 // they are flushed. Take the budget of the parent trace and distribute
2830 // it fairly amongst the children.
2831 if (new_trace.actions() != NULL) {
2832 new_trace.set_flush_budget(new_flush_budget);
2833 }
2834 EmitOutOfLineContinuation(compiler,
2835 &new_trace,
2836 alternatives_->at(i),
2837 alt_gen,
2838 preload_characters,
2839 alt_gens.at(i + 1)->expects_preload);
2840 }
2841}
2842
2843
2844void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
2845 Trace* trace,
2846 GuardedAlternative alternative,
2847 AlternativeGeneration* alt_gen,
2848 int preload_characters,
2849 bool next_expects_preload) {
2850 if (!alt_gen->possible_success.is_linked()) return;
2851
2852 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2853 macro_assembler->Bind(&alt_gen->possible_success);
2854 Trace out_of_line_trace(*trace);
2855 out_of_line_trace.set_characters_preloaded(preload_characters);
2856 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2857 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
2858 ZoneList<Guard*>* guards = alternative.guards();
2859 int guard_count = (guards == NULL) ? 0 : guards->length();
2860 if (next_expects_preload) {
2861 Label reload_current_char;
2862 out_of_line_trace.set_backtrack(&reload_current_char);
2863 for (int j = 0; j < guard_count; j++) {
2864 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2865 }
2866 alternative.node()->Emit(compiler, &out_of_line_trace);
2867 macro_assembler->Bind(&reload_current_char);
2868 // Reload the current character, since the next quick check expects that.
2869 // We don't need to check bounds here because we only get into this
2870 // code through a quick check which already did the checked load.
2871 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
2872 NULL,
2873 false,
2874 preload_characters);
2875 macro_assembler->GoTo(&(alt_gen->after));
2876 } else {
2877 out_of_line_trace.set_backtrack(&(alt_gen->after));
2878 for (int j = 0; j < guard_count; j++) {
2879 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2880 }
2881 alternative.node()->Emit(compiler, &out_of_line_trace);
2882 }
2883}
2884
2885
2886void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2887 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2888 LimitResult limit_result = LimitVersions(compiler, trace);
2889 if (limit_result == DONE) return;
2890 ASSERT(limit_result == CONTINUE);
2891
2892 RecursionCheck rc(compiler);
2893
2894 switch (type_) {
2895 case STORE_POSITION: {
2896 Trace::DeferredCapture
2897 new_capture(data_.u_position_register.reg,
2898 data_.u_position_register.is_capture,
2899 trace);
2900 Trace new_trace = *trace;
2901 new_trace.add_action(&new_capture);
2902 on_success()->Emit(compiler, &new_trace);
2903 break;
2904 }
2905 case INCREMENT_REGISTER: {
2906 Trace::DeferredIncrementRegister
2907 new_increment(data_.u_increment_register.reg);
2908 Trace new_trace = *trace;
2909 new_trace.add_action(&new_increment);
2910 on_success()->Emit(compiler, &new_trace);
2911 break;
2912 }
2913 case SET_REGISTER: {
2914 Trace::DeferredSetRegister
2915 new_set(data_.u_store_register.reg, data_.u_store_register.value);
2916 Trace new_trace = *trace;
2917 new_trace.add_action(&new_set);
2918 on_success()->Emit(compiler, &new_trace);
2919 break;
2920 }
2921 case CLEAR_CAPTURES: {
2922 Trace::DeferredClearCaptures
2923 new_capture(Interval(data_.u_clear_captures.range_from,
2924 data_.u_clear_captures.range_to));
2925 Trace new_trace = *trace;
2926 new_trace.add_action(&new_capture);
2927 on_success()->Emit(compiler, &new_trace);
2928 break;
2929 }
2930 case BEGIN_SUBMATCH:
2931 if (!trace->is_trivial()) {
2932 trace->Flush(compiler, this);
2933 } else {
2934 assembler->WriteCurrentPositionToRegister(
2935 data_.u_submatch.current_position_register, 0);
2936 assembler->WriteStackPointerToRegister(
2937 data_.u_submatch.stack_pointer_register);
2938 on_success()->Emit(compiler, trace);
2939 }
2940 break;
2941 case EMPTY_MATCH_CHECK: {
2942 int start_pos_reg = data_.u_empty_match_check.start_register;
2943 int stored_pos = 0;
2944 int rep_reg = data_.u_empty_match_check.repetition_register;
2945 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2946 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2947 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2948 // If we know we haven't advanced and there is no minimum we
2949 // can just backtrack immediately.
2950 assembler->GoTo(trace->backtrack());
2951 } else if (know_dist && stored_pos < trace->cp_offset()) {
2952 // If we know we've advanced we can generate the continuation
2953 // immediately.
2954 on_success()->Emit(compiler, trace);
2955 } else if (!trace->is_trivial()) {
2956 trace->Flush(compiler, this);
2957 } else {
2958 Label skip_empty_check;
2959 // If we have a minimum number of repetitions we check the current
2960 // number first and skip the empty check if it's not enough.
2961 if (has_minimum) {
2962 int limit = data_.u_empty_match_check.repetition_limit;
2963 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2964 }
2965 // If the match is empty we bail out, otherwise we fall through
2966 // to the on-success continuation.
2967 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2968 trace->backtrack());
2969 assembler->Bind(&skip_empty_check);
2970 on_success()->Emit(compiler, trace);
2971 }
2972 break;
2973 }
2974 case POSITIVE_SUBMATCH_SUCCESS: {
2975 if (!trace->is_trivial()) {
2976 trace->Flush(compiler, this);
2977 return;
2978 }
2979 assembler->ReadCurrentPositionFromRegister(
2980 data_.u_submatch.current_position_register);
2981 assembler->ReadStackPointerFromRegister(
2982 data_.u_submatch.stack_pointer_register);
2983 int clear_register_count = data_.u_submatch.clear_register_count;
2984 if (clear_register_count == 0) {
2985 on_success()->Emit(compiler, trace);
2986 return;
2987 }
2988 int clear_registers_from = data_.u_submatch.clear_register_from;
2989 Label clear_registers_backtrack;
2990 Trace new_trace = *trace;
2991 new_trace.set_backtrack(&clear_registers_backtrack);
2992 on_success()->Emit(compiler, &new_trace);
2993
2994 assembler->Bind(&clear_registers_backtrack);
2995 int clear_registers_to = clear_registers_from + clear_register_count - 1;
2996 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
2997
2998 ASSERT(trace->backtrack() == NULL);
2999 assembler->Backtrack();
3000 return;
3001 }
3002 default:
3003 UNREACHABLE();
3004 }
3005}
3006
3007
3008void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3009 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3010 if (!trace->is_trivial()) {
3011 trace->Flush(compiler, this);
3012 return;
3013 }
3014
3015 LimitResult limit_result = LimitVersions(compiler, trace);
3016 if (limit_result == DONE) return;
3017 ASSERT(limit_result == CONTINUE);
3018
3019 RecursionCheck rc(compiler);
3020
3021 ASSERT_EQ(start_reg_ + 1, end_reg_);
3022 if (compiler->ignore_case()) {
3023 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3024 trace->backtrack());
3025 } else {
3026 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3027 }
3028 on_success()->Emit(compiler, trace);
3029}
3030
3031
3032// -------------------------------------------------------------------
3033// Dot/dotty output
3034
3035
3036#ifdef DEBUG
3037
3038
3039class DotPrinter: public NodeVisitor {
3040 public:
3041 explicit DotPrinter(bool ignore_case)
3042 : ignore_case_(ignore_case),
3043 stream_(&alloc_) { }
3044 void PrintNode(const char* label, RegExpNode* node);
3045 void Visit(RegExpNode* node);
3046 void PrintAttributes(RegExpNode* from);
3047 StringStream* stream() { return &stream_; }
3048 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3049#define DECLARE_VISIT(Type) \
3050 virtual void Visit##Type(Type##Node* that);
3051FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3052#undef DECLARE_VISIT
3053 private:
3054 bool ignore_case_;
3055 HeapStringAllocator alloc_;
3056 StringStream stream_;
3057};
3058
3059
3060void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3061 stream()->Add("digraph G {\n graph [label=\"");
3062 for (int i = 0; label[i]; i++) {
3063 switch (label[i]) {
3064 case '\\':
3065 stream()->Add("\\\\");
3066 break;
3067 case '"':
3068 stream()->Add("\"");
3069 break;
3070 default:
3071 stream()->Put(label[i]);
3072 break;
3073 }
3074 }
3075 stream()->Add("\"];\n");
3076 Visit(node);
3077 stream()->Add("}\n");
3078 printf("%s", *(stream()->ToCString()));
3079}
3080
3081
3082void DotPrinter::Visit(RegExpNode* node) {
3083 if (node->info()->visited) return;
3084 node->info()->visited = true;
3085 node->Accept(this);
3086}
3087
3088
3089void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3090 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3091 Visit(on_failure);
3092}
3093
3094
3095class TableEntryBodyPrinter {
3096 public:
3097 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3098 : stream_(stream), choice_(choice) { }
3099 void Call(uc16 from, DispatchTable::Entry entry) {
3100 OutSet* out_set = entry.out_set();
3101 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3102 if (out_set->Get(i)) {
3103 stream()->Add(" n%p:s%io%i -> n%p;\n",
3104 choice(),
3105 from,
3106 i,
3107 choice()->alternatives()->at(i).node());
3108 }
3109 }
3110 }
3111 private:
3112 StringStream* stream() { return stream_; }
3113 ChoiceNode* choice() { return choice_; }
3114 StringStream* stream_;
3115 ChoiceNode* choice_;
3116};
3117
3118
3119class TableEntryHeaderPrinter {
3120 public:
3121 explicit TableEntryHeaderPrinter(StringStream* stream)
3122 : first_(true), stream_(stream) { }
3123 void Call(uc16 from, DispatchTable::Entry entry) {
3124 if (first_) {
3125 first_ = false;
3126 } else {
3127 stream()->Add("|");
3128 }
3129 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3130 OutSet* out_set = entry.out_set();
3131 int priority = 0;
3132 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3133 if (out_set->Get(i)) {
3134 if (priority > 0) stream()->Add("|");
3135 stream()->Add("<s%io%i> %i", from, i, priority);
3136 priority++;
3137 }
3138 }
3139 stream()->Add("}}");
3140 }
3141 private:
3142 bool first_;
3143 StringStream* stream() { return stream_; }
3144 StringStream* stream_;
3145};
3146
3147
3148class AttributePrinter {
3149 public:
3150 explicit AttributePrinter(DotPrinter* out)
3151 : out_(out), first_(true) { }
3152 void PrintSeparator() {
3153 if (first_) {
3154 first_ = false;
3155 } else {
3156 out_->stream()->Add("|");
3157 }
3158 }
3159 void PrintBit(const char* name, bool value) {
3160 if (!value) return;
3161 PrintSeparator();
3162 out_->stream()->Add("{%s}", name);
3163 }
3164 void PrintPositive(const char* name, int value) {
3165 if (value < 0) return;
3166 PrintSeparator();
3167 out_->stream()->Add("{%s|%x}", name, value);
3168 }
3169 private:
3170 DotPrinter* out_;
3171 bool first_;
3172};
3173
3174
3175void DotPrinter::PrintAttributes(RegExpNode* that) {
3176 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3177 "margin=0.1, fontsize=10, label=\"{",
3178 that);
3179 AttributePrinter printer(this);
3180 NodeInfo* info = that->info();
3181 printer.PrintBit("NI", info->follows_newline_interest);
3182 printer.PrintBit("WI", info->follows_word_interest);
3183 printer.PrintBit("SI", info->follows_start_interest);
3184 Label* label = that->label();
3185 if (label->is_bound())
3186 printer.PrintPositive("@", label->pos());
3187 stream()->Add("}\"];\n");
3188 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3189 "arrowhead=none];\n", that, that);
3190}
3191
3192
3193static const bool kPrintDispatchTable = false;
3194void DotPrinter::VisitChoice(ChoiceNode* that) {
3195 if (kPrintDispatchTable) {
3196 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3197 TableEntryHeaderPrinter header_printer(stream());
3198 that->GetTable(ignore_case_)->ForEach(&header_printer);
3199 stream()->Add("\"]\n", that);
3200 PrintAttributes(that);
3201 TableEntryBodyPrinter body_printer(stream(), that);
3202 that->GetTable(ignore_case_)->ForEach(&body_printer);
3203 } else {
3204 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3205 for (int i = 0; i < that->alternatives()->length(); i++) {
3206 GuardedAlternative alt = that->alternatives()->at(i);
3207 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3208 }
3209 }
3210 for (int i = 0; i < that->alternatives()->length(); i++) {
3211 GuardedAlternative alt = that->alternatives()->at(i);
3212 alt.node()->Accept(this);
3213 }
3214}
3215
3216
3217void DotPrinter::VisitText(TextNode* that) {
3218 stream()->Add(" n%p [label=\"", that);
3219 for (int i = 0; i < that->elements()->length(); i++) {
3220 if (i > 0) stream()->Add(" ");
3221 TextElement elm = that->elements()->at(i);
3222 switch (elm.type) {
3223 case TextElement::ATOM: {
3224 stream()->Add("'%w'", elm.data.u_atom->data());
3225 break;
3226 }
3227 case TextElement::CHAR_CLASS: {
3228 RegExpCharacterClass* node = elm.data.u_char_class;
3229 stream()->Add("[");
3230 if (node->is_negated())
3231 stream()->Add("^");
3232 for (int j = 0; j < node->ranges()->length(); j++) {
3233 CharacterRange range = node->ranges()->at(j);
3234 stream()->Add("%k-%k", range.from(), range.to());
3235 }
3236 stream()->Add("]");
3237 break;
3238 }
3239 default:
3240 UNREACHABLE();
3241 }
3242 }
3243 stream()->Add("\", shape=box, peripheries=2];\n");
3244 PrintAttributes(that);
3245 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3246 Visit(that->on_success());
3247}
3248
3249
3250void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3251 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3252 that,
3253 that->start_register(),
3254 that->end_register());
3255 PrintAttributes(that);
3256 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3257 Visit(that->on_success());
3258}
3259
3260
3261void DotPrinter::VisitEnd(EndNode* that) {
3262 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3263 PrintAttributes(that);
3264}
3265
3266
3267void DotPrinter::VisitAssertion(AssertionNode* that) {
3268 stream()->Add(" n%p [", that);
3269 switch (that->type()) {
3270 case AssertionNode::AT_END:
3271 stream()->Add("label=\"$\", shape=septagon");
3272 break;
3273 case AssertionNode::AT_START:
3274 stream()->Add("label=\"^\", shape=septagon");
3275 break;
3276 case AssertionNode::AT_BOUNDARY:
3277 stream()->Add("label=\"\\b\", shape=septagon");
3278 break;
3279 case AssertionNode::AT_NON_BOUNDARY:
3280 stream()->Add("label=\"\\B\", shape=septagon");
3281 break;
3282 case AssertionNode::AFTER_NEWLINE:
3283 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3284 break;
3285 }
3286 stream()->Add("];\n");
3287 PrintAttributes(that);
3288 RegExpNode* successor = that->on_success();
3289 stream()->Add(" n%p -> n%p;\n", that, successor);
3290 Visit(successor);
3291}
3292
3293
3294void DotPrinter::VisitAction(ActionNode* that) {
3295 stream()->Add(" n%p [", that);
3296 switch (that->type_) {
3297 case ActionNode::SET_REGISTER:
3298 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3299 that->data_.u_store_register.reg,
3300 that->data_.u_store_register.value);
3301 break;
3302 case ActionNode::INCREMENT_REGISTER:
3303 stream()->Add("label=\"$%i++\", shape=octagon",
3304 that->data_.u_increment_register.reg);
3305 break;
3306 case ActionNode::STORE_POSITION:
3307 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3308 that->data_.u_position_register.reg);
3309 break;
3310 case ActionNode::BEGIN_SUBMATCH:
3311 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3312 that->data_.u_submatch.current_position_register);
3313 break;
3314 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3315 stream()->Add("label=\"escape\", shape=septagon");
3316 break;
3317 case ActionNode::EMPTY_MATCH_CHECK:
3318 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3319 that->data_.u_empty_match_check.start_register,
3320 that->data_.u_empty_match_check.repetition_register,
3321 that->data_.u_empty_match_check.repetition_limit);
3322 break;
3323 case ActionNode::CLEAR_CAPTURES: {
3324 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3325 that->data_.u_clear_captures.range_from,
3326 that->data_.u_clear_captures.range_to);
3327 break;
3328 }
3329 }
3330 stream()->Add("];\n");
3331 PrintAttributes(that);
3332 RegExpNode* successor = that->on_success();
3333 stream()->Add(" n%p -> n%p;\n", that, successor);
3334 Visit(successor);
3335}
3336
3337
3338class DispatchTableDumper {
3339 public:
3340 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3341 void Call(uc16 key, DispatchTable::Entry entry);
3342 StringStream* stream() { return stream_; }
3343 private:
3344 StringStream* stream_;
3345};
3346
3347
3348void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3349 stream()->Add("[%k-%k]: {", key, entry.to());
3350 OutSet* set = entry.out_set();
3351 bool first = true;
3352 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3353 if (set->Get(i)) {
3354 if (first) {
3355 first = false;
3356 } else {
3357 stream()->Add(", ");
3358 }
3359 stream()->Add("%i", i);
3360 }
3361 }
3362 stream()->Add("}\n");
3363}
3364
3365
3366void DispatchTable::Dump() {
3367 HeapStringAllocator alloc;
3368 StringStream stream(&alloc);
3369 DispatchTableDumper dumper(&stream);
3370 tree()->ForEach(&dumper);
3371 OS::PrintError("%s", *stream.ToCString());
3372}
3373
3374
3375void RegExpEngine::DotPrint(const char* label,
3376 RegExpNode* node,
3377 bool ignore_case) {
3378 DotPrinter printer(ignore_case);
3379 printer.PrintNode(label, node);
3380}
3381
3382
3383#endif // DEBUG
3384
3385
3386// -------------------------------------------------------------------
3387// Tree to graph conversion
3388
3389static const int kSpaceRangeCount = 20;
3390static const int kSpaceRangeAsciiCount = 4;
3391static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3392 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3393 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3394
3395static const int kWordRangeCount = 8;
3396static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3397 '_', 'a', 'z' };
3398
3399static const int kDigitRangeCount = 2;
3400static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3401
3402static const int kLineTerminatorRangeCount = 6;
3403static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3404 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
3405
3406RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3407 RegExpNode* on_success) {
3408 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3409 elms->Add(TextElement::Atom(this));
3410 return new TextNode(elms, on_success);
3411}
3412
3413
3414RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3415 RegExpNode* on_success) {
3416 return new TextNode(elements(), on_success);
3417}
3418
3419static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3420 const uc16* special_class,
3421 int length) {
3422 ASSERT(ranges->length() != 0);
3423 ASSERT(length != 0);
3424 ASSERT(special_class[0] != 0);
3425 if (ranges->length() != (length >> 1) + 1) {
3426 return false;
3427 }
3428 CharacterRange range = ranges->at(0);
3429 if (range.from() != 0) {
3430 return false;
3431 }
3432 for (int i = 0; i < length; i += 2) {
3433 if (special_class[i] != (range.to() + 1)) {
3434 return false;
3435 }
3436 range = ranges->at((i >> 1) + 1);
3437 if (special_class[i+1] != range.from() - 1) {
3438 return false;
3439 }
3440 }
3441 if (range.to() != 0xffff) {
3442 return false;
3443 }
3444 return true;
3445}
3446
3447
3448static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3449 const uc16* special_class,
3450 int length) {
3451 if (ranges->length() * 2 != length) {
3452 return false;
3453 }
3454 for (int i = 0; i < length; i += 2) {
3455 CharacterRange range = ranges->at(i >> 1);
3456 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3457 return false;
3458 }
3459 }
3460 return true;
3461}
3462
3463
3464bool RegExpCharacterClass::is_standard() {
3465 // TODO(lrn): Remove need for this function, by not throwing away information
3466 // along the way.
3467 if (is_negated_) {
3468 return false;
3469 }
3470 if (set_.is_standard()) {
3471 return true;
3472 }
3473 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3474 set_.set_standard_set_type('s');
3475 return true;
3476 }
3477 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3478 set_.set_standard_set_type('S');
3479 return true;
3480 }
3481 if (CompareInverseRanges(set_.ranges(),
3482 kLineTerminatorRanges,
3483 kLineTerminatorRangeCount)) {
3484 set_.set_standard_set_type('.');
3485 return true;
3486 }
3487 return false;
3488}
3489
3490
3491RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3492 RegExpNode* on_success) {
3493 return new TextNode(this, on_success);
3494}
3495
3496
3497RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3498 RegExpNode* on_success) {
3499 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3500 int length = alternatives->length();
3501 ChoiceNode* result = new ChoiceNode(length);
3502 for (int i = 0; i < length; i++) {
3503 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
3504 on_success));
3505 result->AddAlternative(alternative);
3506 }
3507 return result;
3508}
3509
3510
3511RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
3512 RegExpNode* on_success) {
3513 return ToNode(min(),
3514 max(),
3515 is_greedy(),
3516 body(),
3517 compiler,
3518 on_success);
3519}
3520
3521
3522RegExpNode* RegExpQuantifier::ToNode(int min,
3523 int max,
3524 bool is_greedy,
3525 RegExpTree* body,
3526 RegExpCompiler* compiler,
3527 RegExpNode* on_success,
3528 bool not_at_start) {
3529 // x{f, t} becomes this:
3530 //
3531 // (r++)<-.
3532 // | `
3533 // | (x)
3534 // v ^
3535 // (r=0)-->(?)---/ [if r < t]
3536 // |
3537 // [if r >= f] \----> ...
3538 //
3539
3540 // 15.10.2.5 RepeatMatcher algorithm.
3541 // The parser has already eliminated the case where max is 0. In the case
3542 // where max_match is zero the parser has removed the quantifier if min was
3543 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3544
3545 // If we know that we cannot match zero length then things are a little
3546 // simpler since we don't need to make the special zero length match check
3547 // from step 2.1. If the min and max are small we can unroll a little in
3548 // this case.
3549 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3550 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3551 if (max == 0) return on_success; // This can happen due to recursion.
3552 bool body_can_be_empty = (body->min_match() == 0);
3553 int body_start_reg = RegExpCompiler::kNoRegister;
3554 Interval capture_registers = body->CaptureRegisters();
3555 bool needs_capture_clearing = !capture_registers.is_empty();
3556 if (body_can_be_empty) {
3557 body_start_reg = compiler->AllocateRegister();
3558 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
3559 // Only unroll if there are no captures and the body can't be
3560 // empty.
3561 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3562 int new_max = (max == kInfinity) ? max : max - min;
3563 // Recurse once to get the loop or optional matches after the fixed ones.
3564 RegExpNode* answer = ToNode(
3565 0, new_max, is_greedy, body, compiler, on_success, true);
3566 // Unroll the forced matches from 0 to min. This can cause chains of
3567 // TextNodes (which the parser does not generate). These should be
3568 // combined if it turns out they hinder good code generation.
3569 for (int i = 0; i < min; i++) {
3570 answer = body->ToNode(compiler, answer);
3571 }
3572 return answer;
3573 }
3574 if (max <= kMaxUnrolledMaxMatches) {
3575 ASSERT(min == 0);
3576 // Unroll the optional matches up to max.
3577 RegExpNode* answer = on_success;
3578 for (int i = 0; i < max; i++) {
3579 ChoiceNode* alternation = new ChoiceNode(2);
3580 if (is_greedy) {
3581 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3582 answer)));
3583 alternation->AddAlternative(GuardedAlternative(on_success));
3584 } else {
3585 alternation->AddAlternative(GuardedAlternative(on_success));
3586 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3587 answer)));
3588 }
3589 answer = alternation;
3590 if (not_at_start) alternation->set_not_at_start();
3591 }
3592 return answer;
3593 }
3594 }
3595 bool has_min = min > 0;
3596 bool has_max = max < RegExpTree::kInfinity;
3597 bool needs_counter = has_min || has_max;
3598 int reg_ctr = needs_counter
3599 ? compiler->AllocateRegister()
3600 : RegExpCompiler::kNoRegister;
3601 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
3602 if (not_at_start) center->set_not_at_start();
3603 RegExpNode* loop_return = needs_counter
3604 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3605 : static_cast<RegExpNode*>(center);
3606 if (body_can_be_empty) {
3607 // If the body can be empty we need to check if it was and then
3608 // backtrack.
3609 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3610 reg_ctr,
3611 min,
3612 loop_return);
3613 }
3614 RegExpNode* body_node = body->ToNode(compiler, loop_return);
3615 if (body_can_be_empty) {
3616 // If the body can be empty we need to store the start position
3617 // so we can bail out if it was empty.
3618 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
3619 }
3620 if (needs_capture_clearing) {
3621 // Before entering the body of this loop we need to clear captures.
3622 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3623 }
3624 GuardedAlternative body_alt(body_node);
3625 if (has_max) {
3626 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3627 body_alt.AddGuard(body_guard);
3628 }
3629 GuardedAlternative rest_alt(on_success);
3630 if (has_min) {
3631 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3632 rest_alt.AddGuard(rest_guard);
3633 }
3634 if (is_greedy) {
3635 center->AddLoopAlternative(body_alt);
3636 center->AddContinueAlternative(rest_alt);
3637 } else {
3638 center->AddContinueAlternative(rest_alt);
3639 center->AddLoopAlternative(body_alt);
3640 }
3641 if (needs_counter) {
3642 return ActionNode::SetRegister(reg_ctr, 0, center);
3643 } else {
3644 return center;
3645 }
3646}
3647
3648
3649RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
3650 RegExpNode* on_success) {
3651 NodeInfo info;
3652 switch (type()) {
3653 case START_OF_LINE:
3654 return AssertionNode::AfterNewline(on_success);
3655 case START_OF_INPUT:
3656 return AssertionNode::AtStart(on_success);
3657 case BOUNDARY:
3658 return AssertionNode::AtBoundary(on_success);
3659 case NON_BOUNDARY:
3660 return AssertionNode::AtNonBoundary(on_success);
3661 case END_OF_INPUT:
3662 return AssertionNode::AtEnd(on_success);
3663 case END_OF_LINE: {
3664 // Compile $ in multiline regexps as an alternation with a positive
3665 // lookahead in one side and an end-of-input on the other side.
3666 // We need two registers for the lookahead.
3667 int stack_pointer_register = compiler->AllocateRegister();
3668 int position_register = compiler->AllocateRegister();
3669 // The ChoiceNode to distinguish between a newline and end-of-input.
3670 ChoiceNode* result = new ChoiceNode(2);
3671 // Create a newline atom.
3672 ZoneList<CharacterRange>* newline_ranges =
3673 new ZoneList<CharacterRange>(3);
3674 CharacterRange::AddClassEscape('n', newline_ranges);
3675 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3676 TextNode* newline_matcher = new TextNode(
3677 newline_atom,
3678 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3679 position_register,
3680 0, // No captures inside.
3681 -1, // Ignored if no captures.
3682 on_success));
3683 // Create an end-of-input matcher.
3684 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3685 stack_pointer_register,
3686 position_register,
3687 newline_matcher);
3688 // Add the two alternatives to the ChoiceNode.
3689 GuardedAlternative eol_alternative(end_of_line);
3690 result->AddAlternative(eol_alternative);
3691 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3692 result->AddAlternative(end_alternative);
3693 return result;
3694 }
3695 default:
3696 UNREACHABLE();
3697 }
3698 return on_success;
3699}
3700
3701
3702RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
3703 RegExpNode* on_success) {
3704 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3705 RegExpCapture::EndRegister(index()),
3706 on_success);
3707}
3708
3709
3710RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
3711 RegExpNode* on_success) {
3712 return on_success;
3713}
3714
3715
3716RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
3717 RegExpNode* on_success) {
3718 int stack_pointer_register = compiler->AllocateRegister();
3719 int position_register = compiler->AllocateRegister();
3720
3721 const int registers_per_capture = 2;
3722 const int register_of_first_capture = 2;
3723 int register_count = capture_count_ * registers_per_capture;
3724 int register_start =
3725 register_of_first_capture + capture_from_ * registers_per_capture;
3726
3727 RegExpNode* success;
3728 if (is_positive()) {
3729 RegExpNode* node = ActionNode::BeginSubmatch(
3730 stack_pointer_register,
3731 position_register,
3732 body()->ToNode(
3733 compiler,
3734 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3735 position_register,
3736 register_count,
3737 register_start,
3738 on_success)));
3739 return node;
3740 } else {
3741 // We use a ChoiceNode for a negative lookahead because it has most of
3742 // the characteristics we need. It has the body of the lookahead as its
3743 // first alternative and the expression after the lookahead of the second
3744 // alternative. If the first alternative succeeds then the
3745 // NegativeSubmatchSuccess will unwind the stack including everything the
3746 // choice node set up and backtrack. If the first alternative fails then
3747 // the second alternative is tried, which is exactly the desired result
3748 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3749 // ChoiceNode that knows to ignore the first exit when calculating quick
3750 // checks.
3751 GuardedAlternative body_alt(
3752 body()->ToNode(
3753 compiler,
3754 success = new NegativeSubmatchSuccess(stack_pointer_register,
3755 position_register,
3756 register_count,
3757 register_start)));
3758 ChoiceNode* choice_node =
3759 new NegativeLookaheadChoiceNode(body_alt,
3760 GuardedAlternative(on_success));
3761 return ActionNode::BeginSubmatch(stack_pointer_register,
3762 position_register,
3763 choice_node);
3764 }
3765}
3766
3767
3768RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
3769 RegExpNode* on_success) {
3770 return ToNode(body(), index(), compiler, on_success);
3771}
3772
3773
3774RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3775 int index,
3776 RegExpCompiler* compiler,
3777 RegExpNode* on_success) {
3778 int start_reg = RegExpCapture::StartRegister(index);
3779 int end_reg = RegExpCapture::EndRegister(index);
3780 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
3781 RegExpNode* body_node = body->ToNode(compiler, store_end);
3782 return ActionNode::StorePosition(start_reg, true, body_node);
3783}
3784
3785
3786RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
3787 RegExpNode* on_success) {
3788 ZoneList<RegExpTree*>* children = nodes();
3789 RegExpNode* current = on_success;
3790 for (int i = children->length() - 1; i >= 0; i--) {
3791 current = children->at(i)->ToNode(compiler, current);
3792 }
3793 return current;
3794}
3795
3796
3797static void AddClass(const uc16* elmv,
3798 int elmc,
3799 ZoneList<CharacterRange>* ranges) {
3800 for (int i = 0; i < elmc; i += 2) {
3801 ASSERT(elmv[i] <= elmv[i + 1]);
3802 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3803 }
3804}
3805
3806
3807static void AddClassNegated(const uc16 *elmv,
3808 int elmc,
3809 ZoneList<CharacterRange>* ranges) {
3810 ASSERT(elmv[0] != 0x0000);
3811 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
3812 uc16 last = 0x0000;
3813 for (int i = 0; i < elmc; i += 2) {
3814 ASSERT(last <= elmv[i] - 1);
3815 ASSERT(elmv[i] <= elmv[i + 1]);
3816 ranges->Add(CharacterRange(last, elmv[i] - 1));
3817 last = elmv[i + 1] + 1;
3818 }
3819 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
3820}
3821
3822
3823void CharacterRange::AddClassEscape(uc16 type,
3824 ZoneList<CharacterRange>* ranges) {
3825 switch (type) {
3826 case 's':
3827 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3828 break;
3829 case 'S':
3830 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3831 break;
3832 case 'w':
3833 AddClass(kWordRanges, kWordRangeCount, ranges);
3834 break;
3835 case 'W':
3836 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3837 break;
3838 case 'd':
3839 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3840 break;
3841 case 'D':
3842 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3843 break;
3844 case '.':
3845 AddClassNegated(kLineTerminatorRanges,
3846 kLineTerminatorRangeCount,
3847 ranges);
3848 break;
3849 // This is not a character range as defined by the spec but a
3850 // convenient shorthand for a character class that matches any
3851 // character.
3852 case '*':
3853 ranges->Add(CharacterRange::Everything());
3854 break;
3855 // This is the set of characters matched by the $ and ^ symbols
3856 // in multiline mode.
3857 case 'n':
3858 AddClass(kLineTerminatorRanges,
3859 kLineTerminatorRangeCount,
3860 ranges);
3861 break;
3862 default:
3863 UNREACHABLE();
3864 }
3865}
3866
3867
3868Vector<const uc16> CharacterRange::GetWordBounds() {
3869 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3870}
3871
3872
3873class CharacterRangeSplitter {
3874 public:
3875 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3876 ZoneList<CharacterRange>** excluded)
3877 : included_(included),
3878 excluded_(excluded) { }
3879 void Call(uc16 from, DispatchTable::Entry entry);
3880
3881 static const int kInBase = 0;
3882 static const int kInOverlay = 1;
3883
3884 private:
3885 ZoneList<CharacterRange>** included_;
3886 ZoneList<CharacterRange>** excluded_;
3887};
3888
3889
3890void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3891 if (!entry.out_set()->Get(kInBase)) return;
3892 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3893 ? included_
3894 : excluded_;
3895 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3896 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3897}
3898
3899
3900void CharacterRange::Split(ZoneList<CharacterRange>* base,
3901 Vector<const uc16> overlay,
3902 ZoneList<CharacterRange>** included,
3903 ZoneList<CharacterRange>** excluded) {
3904 ASSERT_EQ(NULL, *included);
3905 ASSERT_EQ(NULL, *excluded);
3906 DispatchTable table;
3907 for (int i = 0; i < base->length(); i++)
3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3909 for (int i = 0; i < overlay.length(); i += 2) {
3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3911 CharacterRangeSplitter::kInOverlay);
3912 }
3913 CharacterRangeSplitter callback(included, excluded);
3914 table.ForEach(&callback);
3915}
3916
3917
3918void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3919 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3920 if (IsSingleton()) {
3921 // If this is a singleton we just expand the one character.
3922 int length = uncanonicalize.get(from(), '\0', chars);
3923 for (int i = 0; i < length; i++) {
3924 uc32 chr = chars[i];
3925 if (chr != from()) {
3926 ranges->Add(CharacterRange::Singleton(chars[i]));
3927 }
3928 }
3929 } else if (from() <= kRangeCanonicalizeMax &&
3930 to() <= kRangeCanonicalizeMax) {
3931 // If this is a range we expand the characters block by block,
3932 // expanding contiguous subranges (blocks) one at a time.
3933 // The approach is as follows. For a given start character we
3934 // look up the block that contains it, for instance 'a' if the
3935 // start character is 'c'. A block is characterized by the property
3936 // that all characters uncanonicalize in the same way as the first
3937 // element, except that each entry in the result is incremented
3938 // by the distance from the first element. So a-z is a block
3939 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3940 // uncanonicalizes to ['a' + k, 'A' + k].
3941 // Once we've found the start point we look up its uncanonicalization
3942 // and produce a range for each element. For instance for [c-f]
3943 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3944 // add a range if it is not already contained in the input, so [c-f]
3945 // will be skipped but [C-F] will be added. If this range is not
3946 // completely contained in a block we do this for all the blocks
3947 // covered by the range.
3948 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3949 // First, look up the block that contains the 'from' character.
3950 int length = canonrange.get(from(), '\0', range);
3951 if (length == 0) {
3952 range[0] = from();
3953 } else {
3954 ASSERT_EQ(1, length);
3955 }
3956 int pos = from();
3957 // The start of the current block. Note that except for the first
3958 // iteration 'start' is always equal to 'pos'.
3959 int start;
3960 // If it is not the start point of a block the entry contains the
3961 // offset of the character from the start point.
3962 if ((range[0] & kStartMarker) == 0) {
3963 start = pos - range[0];
3964 } else {
3965 start = pos;
3966 }
3967 // Then we add the ranges on at a time, incrementing the current
3968 // position to be after the last block each time. The position
3969 // always points to the start of a block.
3970 while (pos < to()) {
3971 length = canonrange.get(start, '\0', range);
3972 if (length == 0) {
3973 range[0] = start;
3974 } else {
3975 ASSERT_EQ(1, length);
3976 }
3977 ASSERT((range[0] & kStartMarker) != 0);
3978 // The start point of a block contains the distance to the end
3979 // of the range.
3980 int block_end = start + (range[0] & kPayloadMask) - 1;
3981 int end = (block_end > to()) ? to() : block_end;
3982 length = uncanonicalize.get(start, '\0', range);
3983 for (int i = 0; i < length; i++) {
3984 uc32 c = range[i];
3985 uc16 range_from = c + (pos - start);
3986 uc16 range_to = c + (end - start);
3987 if (!(from() <= range_from && range_to <= to())) {
3988 ranges->Add(CharacterRange(range_from, range_to));
3989 }
3990 }
3991 start = pos = block_end + 1;
3992 }
3993 } else {
3994 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3995 }
3996}
3997
3998
3999ZoneList<CharacterRange>* CharacterSet::ranges() {
4000 if (ranges_ == NULL) {
4001 ranges_ = new ZoneList<CharacterRange>(2);
4002 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4003 }
4004 return ranges_;
4005}
4006
4007
4008
4009// -------------------------------------------------------------------
4010// Interest propagation
4011
4012
4013RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4014 for (int i = 0; i < siblings_.length(); i++) {
4015 RegExpNode* sibling = siblings_.Get(i);
4016 if (sibling->info()->Matches(info))
4017 return sibling;
4018 }
4019 return NULL;
4020}
4021
4022
4023RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4024 ASSERT_EQ(false, *cloned);
4025 siblings_.Ensure(this);
4026 RegExpNode* result = TryGetSibling(info);
4027 if (result != NULL) return result;
4028 result = this->Clone();
4029 NodeInfo* new_info = result->info();
4030 new_info->ResetCompilationState();
4031 new_info->AddFromPreceding(info);
4032 AddSibling(result);
4033 *cloned = true;
4034 return result;
4035}
4036
4037
4038template <class C>
4039static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4040 NodeInfo full_info(*node->info());
4041 full_info.AddFromPreceding(info);
4042 bool cloned = false;
4043 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4044}
4045
4046
4047// -------------------------------------------------------------------
4048// Splay tree
4049
4050
4051OutSet* OutSet::Extend(unsigned value) {
4052 if (Get(value))
4053 return this;
4054 if (successors() != NULL) {
4055 for (int i = 0; i < successors()->length(); i++) {
4056 OutSet* successor = successors()->at(i);
4057 if (successor->Get(value))
4058 return successor;
4059 }
4060 } else {
4061 successors_ = new ZoneList<OutSet*>(2);
4062 }
4063 OutSet* result = new OutSet(first_, remaining_);
4064 result->Set(value);
4065 successors()->Add(result);
4066 return result;
4067}
4068
4069
4070void OutSet::Set(unsigned value) {
4071 if (value < kFirstLimit) {
4072 first_ |= (1 << value);
4073 } else {
4074 if (remaining_ == NULL)
4075 remaining_ = new ZoneList<unsigned>(1);
4076 if (remaining_->is_empty() || !remaining_->Contains(value))
4077 remaining_->Add(value);
4078 }
4079}
4080
4081
4082bool OutSet::Get(unsigned value) {
4083 if (value < kFirstLimit) {
4084 return (first_ & (1 << value)) != 0;
4085 } else if (remaining_ == NULL) {
4086 return false;
4087 } else {
4088 return remaining_->Contains(value);
4089 }
4090}
4091
4092
4093const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4094const DispatchTable::Entry DispatchTable::Config::kNoValue;
4095
4096
4097void DispatchTable::AddRange(CharacterRange full_range, int value) {
4098 CharacterRange current = full_range;
4099 if (tree()->is_empty()) {
4100 // If this is the first range we just insert into the table.
4101 ZoneSplayTree<Config>::Locator loc;
4102 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4103 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4104 return;
4105 }
4106 // First see if there is a range to the left of this one that
4107 // overlaps.
4108 ZoneSplayTree<Config>::Locator loc;
4109 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4110 Entry* entry = &loc.value();
4111 // If we've found a range that overlaps with this one, and it
4112 // starts strictly to the left of this one, we have to fix it
4113 // because the following code only handles ranges that start on
4114 // or after the start point of the range we're adding.
4115 if (entry->from() < current.from() && entry->to() >= current.from()) {
4116 // Snap the overlapping range in half around the start point of
4117 // the range we're adding.
4118 CharacterRange left(entry->from(), current.from() - 1);
4119 CharacterRange right(current.from(), entry->to());
4120 // The left part of the overlapping range doesn't overlap.
4121 // Truncate the whole entry to be just the left part.
4122 entry->set_to(left.to());
4123 // The right part is the one that overlaps. We add this part
4124 // to the map and let the next step deal with merging it with
4125 // the range we're adding.
4126 ZoneSplayTree<Config>::Locator loc;
4127 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4128 loc.set_value(Entry(right.from(),
4129 right.to(),
4130 entry->out_set()));
4131 }
4132 }
4133 while (current.is_valid()) {
4134 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4135 (loc.value().from() <= current.to()) &&
4136 (loc.value().to() >= current.from())) {
4137 Entry* entry = &loc.value();
4138 // We have overlap. If there is space between the start point of
4139 // the range we're adding and where the overlapping range starts
4140 // then we have to add a range covering just that space.
4141 if (current.from() < entry->from()) {
4142 ZoneSplayTree<Config>::Locator ins;
4143 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4144 ins.set_value(Entry(current.from(),
4145 entry->from() - 1,
4146 empty()->Extend(value)));
4147 current.set_from(entry->from());
4148 }
4149 ASSERT_EQ(current.from(), entry->from());
4150 // If the overlapping range extends beyond the one we want to add
4151 // we have to snap the right part off and add it separately.
4152 if (entry->to() > current.to()) {
4153 ZoneSplayTree<Config>::Locator ins;
4154 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4155 ins.set_value(Entry(current.to() + 1,
4156 entry->to(),
4157 entry->out_set()));
4158 entry->set_to(current.to());
4159 }
4160 ASSERT(entry->to() <= current.to());
4161 // The overlapping range is now completely contained by the range
4162 // we're adding so we can just update it and move the start point
4163 // of the range we're adding just past it.
4164 entry->AddValue(value);
4165 // Bail out if the last interval ended at 0xFFFF since otherwise
4166 // adding 1 will wrap around to 0.
4167 if (entry->to() == String::kMaxUC16CharCode)
4168 break;
4169 ASSERT(entry->to() + 1 > current.from());
4170 current.set_from(entry->to() + 1);
4171 } else {
4172 // There is no overlap so we can just add the range
4173 ZoneSplayTree<Config>::Locator ins;
4174 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4175 ins.set_value(Entry(current.from(),
4176 current.to(),
4177 empty()->Extend(value)));
4178 break;
4179 }
4180 }
4181}
4182
4183
4184OutSet* DispatchTable::Get(uc16 value) {
4185 ZoneSplayTree<Config>::Locator loc;
4186 if (!tree()->FindGreatestLessThan(value, &loc))
4187 return empty();
4188 Entry* entry = &loc.value();
4189 if (value <= entry->to())
4190 return entry->out_set();
4191 else
4192 return empty();
4193}
4194
4195
4196// -------------------------------------------------------------------
4197// Analysis
4198
4199
4200void Analysis::EnsureAnalyzed(RegExpNode* that) {
4201 StackLimitCheck check;
4202 if (check.HasOverflowed()) {
4203 fail("Stack overflow");
4204 return;
4205 }
4206 if (that->info()->been_analyzed || that->info()->being_analyzed)
4207 return;
4208 that->info()->being_analyzed = true;
4209 that->Accept(this);
4210 that->info()->being_analyzed = false;
4211 that->info()->been_analyzed = true;
4212}
4213
4214
4215void Analysis::VisitEnd(EndNode* that) {
4216 // nothing to do
4217}
4218
4219
4220void TextNode::CalculateOffsets() {
4221 int element_count = elements()->length();
4222 // Set up the offsets of the elements relative to the start. This is a fixed
4223 // quantity since a TextNode can only contain fixed-width things.
4224 int cp_offset = 0;
4225 for (int i = 0; i < element_count; i++) {
4226 TextElement& elm = elements()->at(i);
4227 elm.cp_offset = cp_offset;
4228 if (elm.type == TextElement::ATOM) {
4229 cp_offset += elm.data.u_atom->data().length();
4230 } else {
4231 cp_offset++;
4232 Vector<const uc16> quarks = elm.data.u_atom->data();
4233 }
4234 }
4235}
4236
4237
4238void Analysis::VisitText(TextNode* that) {
4239 if (ignore_case_) {
4240 that->MakeCaseIndependent();
4241 }
4242 EnsureAnalyzed(that->on_success());
4243 if (!has_failed()) {
4244 that->CalculateOffsets();
4245 }
4246}
4247
4248
4249void Analysis::VisitAction(ActionNode* that) {
4250 RegExpNode* target = that->on_success();
4251 EnsureAnalyzed(target);
4252 if (!has_failed()) {
4253 // If the next node is interested in what it follows then this node
4254 // has to be interested too so it can pass the information on.
4255 that->info()->AddFromFollowing(target->info());
4256 }
4257}
4258
4259
4260void Analysis::VisitChoice(ChoiceNode* that) {
4261 NodeInfo* info = that->info();
4262 for (int i = 0; i < that->alternatives()->length(); i++) {
4263 RegExpNode* node = that->alternatives()->at(i).node();
4264 EnsureAnalyzed(node);
4265 if (has_failed()) return;
4266 // Anything the following nodes need to know has to be known by
4267 // this node also, so it can pass it on.
4268 info->AddFromFollowing(node->info());
4269 }
4270}
4271
4272
4273void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4274 NodeInfo* info = that->info();
4275 for (int i = 0; i < that->alternatives()->length(); i++) {
4276 RegExpNode* node = that->alternatives()->at(i).node();
4277 if (node != that->loop_node()) {
4278 EnsureAnalyzed(node);
4279 if (has_failed()) return;
4280 info->AddFromFollowing(node->info());
4281 }
4282 }
4283 // Check the loop last since it may need the value of this node
4284 // to get a correct result.
4285 EnsureAnalyzed(that->loop_node());
4286 if (!has_failed()) {
4287 info->AddFromFollowing(that->loop_node()->info());
4288 }
4289}
4290
4291
4292void Analysis::VisitBackReference(BackReferenceNode* that) {
4293 EnsureAnalyzed(that->on_success());
4294}
4295
4296
4297void Analysis::VisitAssertion(AssertionNode* that) {
4298 EnsureAnalyzed(that->on_success());
4299}
4300
4301
4302// -------------------------------------------------------------------
4303// Dispatch table construction
4304
4305
4306void DispatchTableConstructor::VisitEnd(EndNode* that) {
4307 AddRange(CharacterRange::Everything());
4308}
4309
4310
4311void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4312 node->set_being_calculated(true);
4313 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4314 for (int i = 0; i < alternatives->length(); i++) {
4315 set_choice_index(i);
4316 alternatives->at(i).node()->Accept(this);
4317 }
4318 node->set_being_calculated(false);
4319}
4320
4321
4322class AddDispatchRange {
4323 public:
4324 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4325 : constructor_(constructor) { }
4326 void Call(uc32 from, DispatchTable::Entry entry);
4327 private:
4328 DispatchTableConstructor* constructor_;
4329};
4330
4331
4332void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4333 CharacterRange range(from, entry.to());
4334 constructor_->AddRange(range);
4335}
4336
4337
4338void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4339 if (node->being_calculated())
4340 return;
4341 DispatchTable* table = node->GetTable(ignore_case_);
4342 AddDispatchRange adder(this);
4343 table->ForEach(&adder);
4344}
4345
4346
4347void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4348 // TODO(160): Find the node that we refer back to and propagate its start
4349 // set back to here. For now we just accept anything.
4350 AddRange(CharacterRange::Everything());
4351}
4352
4353
4354void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4355 RegExpNode* target = that->on_success();
4356 target->Accept(this);
4357}
4358
4359
4360
4361static int CompareRangeByFrom(const CharacterRange* a,
4362 const CharacterRange* b) {
4363 return Compare<uc16>(a->from(), b->from());
4364}
4365
4366
4367void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4368 ranges->Sort(CompareRangeByFrom);
4369 uc16 last = 0;
4370 for (int i = 0; i < ranges->length(); i++) {
4371 CharacterRange range = ranges->at(i);
4372 if (last < range.from())
4373 AddRange(CharacterRange(last, range.from() - 1));
4374 if (range.to() >= last) {
4375 if (range.to() == String::kMaxUC16CharCode) {
4376 return;
4377 } else {
4378 last = range.to() + 1;
4379 }
4380 }
4381 }
4382 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
4383}
4384
4385
4386void DispatchTableConstructor::VisitText(TextNode* that) {
4387 TextElement elm = that->elements()->at(0);
4388 switch (elm.type) {
4389 case TextElement::ATOM: {
4390 uc16 c = elm.data.u_atom->data()[0];
4391 AddRange(CharacterRange(c, c));
4392 break;
4393 }
4394 case TextElement::CHAR_CLASS: {
4395 RegExpCharacterClass* tree = elm.data.u_char_class;
4396 ZoneList<CharacterRange>* ranges = tree->ranges();
4397 if (tree->is_negated()) {
4398 AddInverse(ranges);
4399 } else {
4400 for (int i = 0; i < ranges->length(); i++)
4401 AddRange(ranges->at(i));
4402 }
4403 break;
4404 }
4405 default: {
4406 UNIMPLEMENTED();
4407 }
4408 }
4409}
4410
4411
4412void DispatchTableConstructor::VisitAction(ActionNode* that) {
4413 RegExpNode* target = that->on_success();
4414 target->Accept(this);
4415}
4416
4417
4418RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4419 bool ignore_case,
4420 bool is_multiline,
4421 Handle<String> pattern,
4422 bool is_ascii) {
4423 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
4424 return IrregexpRegExpTooBig();
4425 }
4426 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
4427 // Wrap the body of the regexp in capture #0.
4428 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
4429 0,
4430 &compiler,
4431 compiler.accept());
4432 RegExpNode* node = captured_body;
4433 if (!data->tree->IsAnchored()) {
4434 // Add a .*? at the beginning, outside the body capture, unless
4435 // this expression is anchored at the beginning.
4436 RegExpNode* loop_node =
4437 RegExpQuantifier::ToNode(0,
4438 RegExpTree::kInfinity,
4439 false,
4440 new RegExpCharacterClass('*'),
4441 &compiler,
4442 captured_body,
4443 data->contains_anchor);
4444
4445 if (data->contains_anchor) {
4446 // Unroll loop once, to take care of the case that might start
4447 // at the start of input.
4448 ChoiceNode* first_step_node = new ChoiceNode(2);
4449 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4450 first_step_node->AddAlternative(GuardedAlternative(
4451 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4452 node = first_step_node;
4453 } else {
4454 node = loop_node;
4455 }
4456 }
4457 data->node = node;
4458 Analysis analysis(ignore_case);
4459 analysis.EnsureAnalyzed(node);
4460 if (analysis.has_failed()) {
4461 const char* error_message = analysis.error_message();
4462 return CompilationResult(error_message);
4463 }
4464
4465 NodeInfo info = *node->info();
4466
4467 // Create the correct assembler for the architecture.
4468#ifdef V8_NATIVE_REGEXP
4469 // Native regexp implementation.
4470
4471 NativeRegExpMacroAssembler::Mode mode =
4472 is_ascii ? NativeRegExpMacroAssembler::ASCII
4473 : NativeRegExpMacroAssembler::UC16;
4474
4475#if V8_TARGET_ARCH_IA32
4476 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
4477#elif V8_TARGET_ARCH_X64
4478 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
4479#elif V8_TARGET_ARCH_ARM
4480 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
4481#endif
4482
4483#else // ! V8_NATIVE_REGEXP
4484 // Interpreted regexp implementation.
4485 EmbeddedVector<byte, 1024> codes;
4486 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4487#endif
4488
4489 return compiler.Assemble(&macro_assembler,
4490 node,
4491 data->capture_count,
4492 pattern);
4493}
4494
4495}} // namespace v8::internal