blob: bf3bdb777f126e99dd42c170bf673030100fa423 [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_JSREGEXP_H_
29#define V8_JSREGEXP_H_
30
31namespace v8 { namespace internal {
32
ager@chromium.orga74f0da2008-12-03 16:05:52 +000033
34class RegExpMacroAssembler;
35
36
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000037class RegExpImpl {
38 public:
39 // Creates a regular expression literal in the old space.
40 // This function calls the garbage collector if necessary.
mads.s.ager@gmail.com9a4089a2008-09-01 08:55:01 +000041 static Handle<Object> CreateRegExpLiteral(Handle<JSFunction> constructor,
42 Handle<String> pattern,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000043 Handle<String> flags,
44 bool* has_pending_exception);
45
46 // Returns a string representation of a regular expression.
47 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
48 // This function calls the garbage collector if necessary.
49 static Handle<String> ToString(Handle<Object> value);
50
ager@chromium.org8bb60582008-12-11 12:02:20 +000051 // Parses the RegExp pattern and prepares the JSRegExp object with
52 // generic data and choice of implementation - as well as what
53 // the implementation wants to store in the data field.
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000054 static Handle<Object> Compile(Handle<JSRegExp> re,
55 Handle<String> pattern,
56 Handle<String> flags);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000057
58 // Implements RegExp.prototype.exec(string) function.
59 // See ECMA-262 section 15.10.6.2.
60 // This function calls the garbage collector if necessary.
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000061 static Handle<Object> Exec(Handle<JSRegExp> regexp,
62 Handle<String> subject,
63 Handle<Object> index);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000064
65 // Call RegExp.prototyp.exec(string) in a loop.
66 // Used by String.prototype.match and String.prototype.replace.
67 // This function calls the garbage collector if necessary.
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000068 static Handle<Object> ExecGlobal(Handle<JSRegExp> regexp,
69 Handle<String> subject);
70
ager@chromium.orga74f0da2008-12-03 16:05:52 +000071 // Stores an uncompiled RegExp pattern in the JSRegExp object.
72 // It will be compiled by JSCRE when first executed.
73 static Handle<Object> JscrePrepare(Handle<JSRegExp> re,
74 Handle<String> pattern,
75 JSRegExp::Flags flags);
76
ager@chromium.org8bb60582008-12-11 12:02:20 +000077 // Prepares a JSRegExp object with Irregexp-specific data.
ager@chromium.orga74f0da2008-12-03 16:05:52 +000078 static Handle<Object> IrregexpPrepare(Handle<JSRegExp> re,
79 Handle<String> pattern,
ager@chromium.org8bb60582008-12-11 12:02:20 +000080 JSRegExp::Flags flags);
ager@chromium.orga74f0da2008-12-03 16:05:52 +000081
82
83 // Compile the pattern using JSCRE and store the result in the
84 // JSRegExp object.
85 static Handle<Object> JscreCompile(Handle<JSRegExp> re);
86
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000087 static Handle<Object> AtomCompile(Handle<JSRegExp> re,
kasperl@chromium.org9fe21c62008-10-28 08:53:51 +000088 Handle<String> pattern,
ager@chromium.orga74f0da2008-12-03 16:05:52 +000089 JSRegExp::Flags flags,
90 Handle<String> match_pattern);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +000091 static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
92 Handle<String> subject,
93 Handle<Object> index);
94
95 static Handle<Object> AtomExecGlobal(Handle<JSRegExp> regexp,
96 Handle<String> subject);
97
ager@chromium.orga74f0da2008-12-03 16:05:52 +000098 static Handle<Object> JscreCompile(Handle<JSRegExp> re,
99 Handle<String> pattern,
100 JSRegExp::Flags flags);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000101
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000102 // Execute a compiled JSCRE pattern.
103 static Handle<Object> JscreExec(Handle<JSRegExp> regexp,
104 Handle<String> subject,
105 Handle<Object> index);
kasperl@chromium.org41044eb2008-10-06 08:24:46 +0000106
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000107 // Execute an Irregexp bytecode pattern.
108 static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp,
109 Handle<String> subject,
110 Handle<Object> index);
111
112 static Handle<Object> JscreExecGlobal(Handle<JSRegExp> regexp,
113 Handle<String> subject);
114
115 static Handle<Object> IrregexpExecGlobal(Handle<JSRegExp> regexp,
116 Handle<String> subject);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000117
118 static void NewSpaceCollectionPrologue();
119 static void OldSpaceCollectionPrologue();
120
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000121 // Converts a source string to a 16 bit flat string. The string
122 // will be either sequential or it will be a SlicedString backed
123 // by a flat string.
124 static Handle<String> StringToTwoByte(Handle<String> pattern);
125 static Handle<String> CachedStringToTwoByte(Handle<String> pattern);
126
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000127 static const int kIrregexpImplementationIndex = 0;
128 static const int kIrregexpNumberOfCapturesIndex = 1;
129 static const int kIrregexpNumberOfRegistersIndex = 2;
130 static const int kIrregexpCodeIndex = 3;
131 static const int kIrregexpDataLength = 4;
132
133 static const int kJscreNumberOfCapturesIndex = 0;
134 static const int kJscreInternalIndex = 1;
135 static const int kJscreDataLength = 2;
136
137 private:
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000138 static String* last_ascii_string_;
139 static String* two_byte_cached_string_;
140
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000141 static int JscreNumberOfCaptures(Handle<JSRegExp> re);
142 static ByteArray* JscreInternal(Handle<JSRegExp> re);
143
ager@chromium.org8bb60582008-12-11 12:02:20 +0000144 static int IrregexpNumberOfCaptures(Handle<FixedArray> re);
145 static int IrregexpNumberOfRegisters(Handle<FixedArray> re);
146 static Handle<ByteArray> IrregexpByteCode(Handle<FixedArray> re);
147 static Handle<Code> IrregexpNativeCode(Handle<FixedArray> re);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000148
149 // Call jsRegExpExecute once
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000150 static Handle<Object> JscreExecOnce(Handle<JSRegExp> regexp,
151 int num_captures,
152 Handle<String> subject,
153 int previous_index,
154 const uc16* utf8_subject,
155 int* ovector,
156 int ovector_length);
157
ager@chromium.org8bb60582008-12-11 12:02:20 +0000158 static Handle<Object> IrregexpExecOnce(Handle<FixedArray> regexp,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000159 int num_captures,
160 Handle<String> subject16,
161 int previous_index,
162 int* ovector,
163 int ovector_length);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000164
165 // Set the subject cache. The previous string buffer is not deleted, so the
166 // caller should ensure that it doesn't leak.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000167 static void SetSubjectCache(String* subject,
168 char* utf8_subject,
169 int uft8_length,
170 int character_position,
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000171 int utf8_position);
172
173 // A one element cache of the last utf8_subject string and its length. The
174 // subject JS String object is cached in the heap. We also cache a
175 // translation between position and utf8 position.
176 static char* utf8_subject_cache_;
177 static int utf8_length_cache_;
178 static int utf8_position_;
179 static int character_position_;
180};
181
182
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000183class CharacterRange {
184 public:
185 CharacterRange() : from_(0), to_(0) { }
186 // For compatibility with the CHECK_OK macro
187 CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT
188 CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
189 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);
190 static Vector<const uc16> GetWordBounds();
191 static inline CharacterRange Singleton(uc16 value) {
192 return CharacterRange(value, value);
193 }
194 static inline CharacterRange Range(uc16 from, uc16 to) {
195 ASSERT(from <= to);
196 return CharacterRange(from, to);
197 }
198 static inline CharacterRange Everything() {
199 return CharacterRange(0, 0xFFFF);
200 }
201 bool Contains(uc16 i) { return from_ <= i && i <= to_; }
202 uc16 from() const { return from_; }
203 void set_from(uc16 value) { from_ = value; }
204 uc16 to() const { return to_; }
205 void set_to(uc16 value) { to_ = value; }
206 bool is_valid() { return from_ <= to_; }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000207 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000208 bool IsSingleton() { return (from_ == to_); }
209 void AddCaseEquivalents(ZoneList<CharacterRange>* ranges);
210 static void Split(ZoneList<CharacterRange>* base,
211 Vector<const uc16> overlay,
212 ZoneList<CharacterRange>** included,
213 ZoneList<CharacterRange>** excluded);
214
215 static const int kRangeCanonicalizeMax = 0x346;
216 static const int kStartMarker = (1 << 24);
217 static const int kPayloadMask = (1 << 24) - 1;
218
219 private:
220 uc16 from_;
221 uc16 to_;
222};
223
224
225template <typename Node, class Callback>
226static void DoForEach(Node* node, Callback* callback);
227
228
229// A zone splay tree. The config type parameter encapsulates the
230// different configurations of a concrete splay tree:
231//
232// typedef Key: the key type
233// typedef Value: the value type
234// static const kNoKey: the dummy key used when no key is set
235// static const kNoValue: the dummy value used to initialize nodes
236// int (Compare)(Key& a, Key& b) -> {-1, 0, 1}: comparison function
237//
238template <typename Config>
239class ZoneSplayTree : public ZoneObject {
240 public:
241 typedef typename Config::Key Key;
242 typedef typename Config::Value Value;
243
244 class Locator;
245
246 ZoneSplayTree() : root_(NULL) { }
247
248 // Inserts the given key in this tree with the given value. Returns
249 // true if a node was inserted, otherwise false. If found the locator
250 // is enabled and provides access to the mapping for the key.
251 bool Insert(const Key& key, Locator* locator);
252
253 // Looks up the key in this tree and returns true if it was found,
254 // otherwise false. If the node is found the locator is enabled and
255 // provides access to the mapping for the key.
256 bool Find(const Key& key, Locator* locator);
257
258 // Finds the mapping with the greatest key less than or equal to the
259 // given key.
260 bool FindGreatestLessThan(const Key& key, Locator* locator);
261
262 // Find the mapping with the greatest key in this tree.
263 bool FindGreatest(Locator* locator);
264
265 // Finds the mapping with the least key greater than or equal to the
266 // given key.
267 bool FindLeastGreaterThan(const Key& key, Locator* locator);
268
269 // Find the mapping with the least key in this tree.
270 bool FindLeast(Locator* locator);
271
272 // Remove the node with the given key from the tree.
273 bool Remove(const Key& key);
274
275 bool is_empty() { return root_ == NULL; }
276
277 // Perform the splay operation for the given key. Moves the node with
278 // the given key to the top of the tree. If no node has the given
279 // key, the last node on the search path is moved to the top of the
280 // tree.
281 void Splay(const Key& key);
282
283 class Node : public ZoneObject {
284 public:
285 Node(const Key& key, const Value& value)
286 : key_(key),
287 value_(value),
288 left_(NULL),
289 right_(NULL) { }
290 Key key() { return key_; }
291 Value value() { return value_; }
292 Node* left() { return left_; }
293 Node* right() { return right_; }
294 private:
295 friend class ZoneSplayTree;
296 friend class Locator;
297 Key key_;
298 Value value_;
299 Node* left_;
300 Node* right_;
301 };
302
303 // A locator provides access to a node in the tree without actually
304 // exposing the node.
305 class Locator {
306 public:
307 explicit Locator(Node* node) : node_(node) { }
308 Locator() : node_(NULL) { }
309 const Key& key() { return node_->key_; }
310 Value& value() { return node_->value_; }
311 void set_value(const Value& value) { node_->value_ = value; }
312 inline void bind(Node* node) { node_ = node; }
313 private:
314 Node* node_;
315 };
316
317 template <class Callback>
318 void ForEach(Callback* c) {
319 DoForEach<typename ZoneSplayTree<Config>::Node, Callback>(root_, c);
320 }
321
322 private:
323 Node* root_;
324};
325
326
327// A set of unsigned integers that behaves especially well on small
328// integers (< 32). May do zone-allocation.
329class OutSet: public ZoneObject {
330 public:
331 OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
332 OutSet* Extend(unsigned value);
333 bool Get(unsigned value);
334 static const unsigned kFirstLimit = 32;
335
336 private:
337 // Destructively set a value in this set. In most cases you want
338 // to use Extend instead to ensure that only one instance exists
339 // that contains the same values.
340 void Set(unsigned value);
341
342 // The successors are a list of sets that contain the same values
343 // as this set and the one more value that is not present in this
344 // set.
345 ZoneList<OutSet*>* successors() { return successors_; }
346
347 OutSet(uint32_t first, ZoneList<unsigned>* remaining)
348 : first_(first), remaining_(remaining), successors_(NULL) { }
349 uint32_t first_;
350 ZoneList<unsigned>* remaining_;
351 ZoneList<OutSet*>* successors_;
ager@chromium.org32912102009-01-16 10:38:43 +0000352 friend class Trace;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000353};
354
355
356// A mapping from integers, specified as ranges, to a set of integers.
357// Used for mapping character ranges to choices.
358class DispatchTable : public ZoneObject {
359 public:
360 class Entry {
361 public:
362 Entry() : from_(0), to_(0), out_set_(NULL) { }
363 Entry(uc16 from, uc16 to, OutSet* out_set)
364 : from_(from), to_(to), out_set_(out_set) { }
365 uc16 from() { return from_; }
366 uc16 to() { return to_; }
367 void set_to(uc16 value) { to_ = value; }
368 void AddValue(int value) { out_set_ = out_set_->Extend(value); }
369 OutSet* out_set() { return out_set_; }
370 private:
371 uc16 from_;
372 uc16 to_;
373 OutSet* out_set_;
374 };
375
376 class Config {
377 public:
378 typedef uc16 Key;
379 typedef Entry Value;
380 static const uc16 kNoKey;
381 static const Entry kNoValue;
382 static inline int Compare(uc16 a, uc16 b) {
383 if (a == b)
384 return 0;
385 else if (a < b)
386 return -1;
387 else
388 return 1;
389 }
390 };
391
392 void AddRange(CharacterRange range, int value);
393 OutSet* Get(uc16 value);
394 void Dump();
395
396 template <typename Callback>
397 void ForEach(Callback* callback) { return tree()->ForEach(callback); }
398 private:
399 // There can't be a static empty set since it allocates its
400 // successors in a zone and caches them.
401 OutSet* empty() { return &empty_; }
402 OutSet empty_;
403 ZoneSplayTree<Config>* tree() { return &tree_; }
404 ZoneSplayTree<Config> tree_;
405};
406
407
408#define FOR_EACH_NODE_TYPE(VISIT) \
409 VISIT(End) \
410 VISIT(Action) \
411 VISIT(Choice) \
412 VISIT(BackReference) \
413 VISIT(Text)
414
415
416#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
417 VISIT(Disjunction) \
418 VISIT(Alternative) \
419 VISIT(Assertion) \
420 VISIT(CharacterClass) \
421 VISIT(Atom) \
422 VISIT(Quantifier) \
423 VISIT(Capture) \
424 VISIT(Lookahead) \
425 VISIT(BackReference) \
426 VISIT(Empty) \
427 VISIT(Text)
428
429
430#define FORWARD_DECLARE(Name) class RegExp##Name;
431FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
432#undef FORWARD_DECLARE
433
434
435class TextElement {
436 public:
437 enum Type {UNINITIALIZED, ATOM, CHAR_CLASS};
438 TextElement() : type(UNINITIALIZED) { }
ager@chromium.org8bb60582008-12-11 12:02:20 +0000439 explicit TextElement(Type t) : type(t), cp_offset(-1) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000440 static TextElement Atom(RegExpAtom* atom);
441 static TextElement CharClass(RegExpCharacterClass* char_class);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000442 int length();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000443 Type type;
444 union {
445 RegExpAtom* u_atom;
446 RegExpCharacterClass* u_char_class;
447 } data;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000448 int cp_offset;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000449};
450
451
ager@chromium.org32912102009-01-16 10:38:43 +0000452class Trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000453
454
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000455struct NodeInfo {
456 enum TriBool {
457 UNKNOWN = -1, FALSE = 0, TRUE = 1
458 };
459
460 NodeInfo()
461 : being_analyzed(false),
462 been_analyzed(false),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000463 follows_word_interest(false),
464 follows_newline_interest(false),
465 follows_start_interest(false),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000466 at_end(false),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000467 visited(false) { }
468
469 // Returns true if the interests and assumptions of this node
470 // matches the given one.
471 bool Matches(NodeInfo* that) {
472 return (at_end == that->at_end) &&
473 (follows_word_interest == that->follows_word_interest) &&
474 (follows_newline_interest == that->follows_newline_interest) &&
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000475 (follows_start_interest == that->follows_start_interest);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000476 }
477
478 // Updates the interests of this node given the interests of the
479 // node preceding it.
480 void AddFromPreceding(NodeInfo* that) {
481 at_end |= that->at_end;
482 follows_word_interest |= that->follows_word_interest;
483 follows_newline_interest |= that->follows_newline_interest;
484 follows_start_interest |= that->follows_start_interest;
485 }
486
ager@chromium.org8bb60582008-12-11 12:02:20 +0000487 bool HasLookbehind() {
488 return follows_word_interest ||
489 follows_newline_interest ||
490 follows_start_interest;
491 }
492
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000493 // Sets the interests of this node to include the interests of the
494 // following node.
495 void AddFromFollowing(NodeInfo* that) {
496 follows_word_interest |= that->follows_word_interest;
497 follows_newline_interest |= that->follows_newline_interest;
498 follows_start_interest |= that->follows_start_interest;
499 }
500
501 void ResetCompilationState() {
502 being_analyzed = false;
503 been_analyzed = false;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000504 }
505
506 bool being_analyzed: 1;
507 bool been_analyzed: 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000508
509 // These bits are set of this node has to know what the preceding
510 // character was.
511 bool follows_word_interest: 1;
512 bool follows_newline_interest: 1;
513 bool follows_start_interest: 1;
514
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000515 bool at_end: 1;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000516 bool visited: 1;
517};
518
519
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000520class SiblingList {
521 public:
522 SiblingList() : list_(NULL) { }
523 int length() {
524 return list_ == NULL ? 0 : list_->length();
525 }
526 void Ensure(RegExpNode* parent) {
527 if (list_ == NULL) {
528 list_ = new ZoneList<RegExpNode*>(2);
529 list_->Add(parent);
530 }
531 }
532 void Add(RegExpNode* node) { list_->Add(node); }
533 RegExpNode* Get(int index) { return list_->at(index); }
534 private:
535 ZoneList<RegExpNode*>* list_;
536};
537
538
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000539// Details of a quick mask-compare check that can look ahead in the
540// input stream.
541class QuickCheckDetails {
542 public:
543 QuickCheckDetails()
544 : characters_(0),
545 mask_(0),
546 value_(0) { }
547 explicit QuickCheckDetails(int characters)
548 : characters_(characters),
549 mask_(0),
550 value_(0) { }
551 bool Rationalize(bool ascii);
552 // Merge in the information from another branch of an alternation.
553 void Merge(QuickCheckDetails* other, int from_index);
554 // Advance the current position by some amount.
555 void Advance(int by, bool ascii);
556 void Clear();
557 struct Position {
558 Position() : mask(0), value(0), determines_perfectly(false) { }
559 uc16 mask;
560 uc16 value;
561 bool determines_perfectly;
562 };
563 int characters() { return characters_; }
564 void set_characters(int characters) { characters_ = characters; }
565 Position* positions(int index) {
566 ASSERT(index >= 0);
567 ASSERT(index < characters_);
568 return positions_ + index;
569 }
570 uint32_t mask() { return mask_; }
571 uint32_t value() { return value_; }
572
573 private:
574 // How many characters do we have quick check information from. This is
575 // the same for all branches of a choice node.
576 int characters_;
577 Position positions_[4];
578 // These values are the condensate of the above array after Rationalize().
579 uint32_t mask_;
580 uint32_t value_;
581};
582
583
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000584class RegExpNode: public ZoneObject {
585 public:
ager@chromium.org32912102009-01-16 10:38:43 +0000586 RegExpNode() : trace_count_(0) { }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000587 virtual ~RegExpNode();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000588 virtual void Accept(NodeVisitor* visitor) = 0;
589 // Generates a goto to this node or actually generates the code at this point.
590 // Until the implementation is complete we will return true for success and
591 // false for failure.
ager@chromium.org32912102009-01-16 10:38:43 +0000592 virtual bool Emit(RegExpCompiler* compiler, Trace* trace) = 0;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000593 // How many characters must this node consume at a minimum in order to
594 // succeed.
595 virtual int EatsAtLeast(int recursion_depth) = 0;
596 // Emits some quick code that checks whether the preloaded characters match.
597 // Falls through on certain failure, jumps to the label on possible success.
598 // If the node cannot make a quick check it does nothing and returns false.
599 bool EmitQuickCheck(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +0000600 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000601 bool preload_has_checked_bounds,
602 Label* on_possible_success,
603 QuickCheckDetails* details_return,
604 bool fall_through_on_failure);
605 // For a given number of characters this returns a mask and a value. The
606 // next n characters are anded with the mask and compared with the value.
607 // A comparison failure indicates the node cannot match the next n characters.
608 // A comparison success indicates the node may match.
609 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
610 RegExpCompiler* compiler,
611 int characters_filled_in) = 0;
ager@chromium.org8bb60582008-12-11 12:02:20 +0000612 static const int kNodeIsTooComplexForGreedyLoops = -1;
613 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
614 Label* label() { return &label_; }
ager@chromium.org32912102009-01-16 10:38:43 +0000615 // If non-generic code is generated for a node (ie the node is not at the
616 // start of the trace) then it cannot be reused. This variable sets a limit
617 // on how often we allow that to happen before we insist on starting a new
618 // trace and generating generic code for a node that can be reused by flushing
619 // the deferred actions in the current trace and generating a goto.
620 static const int kMaxCopiesCodeGenerated = 10;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000621
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000622 // Propagates the given interest information forward. When seeing
623 // \bfoo for instance, the \b is implemented by propagating forward
624 // to the 'foo' string that it should only succeed if its first
625 // character is a letter xor the previous character was a letter.
626 virtual RegExpNode* PropagateForward(NodeInfo* info) = 0;
627
628 NodeInfo* info() { return &info_; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000629
630 void AddSibling(RegExpNode* node) { siblings_.Add(node); }
631
632 // Static version of EnsureSibling that expresses the fact that the
633 // result has the same type as the input.
634 template <class C>
635 static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) {
636 return static_cast<C*>(node->EnsureSibling(info, cloned));
637 }
638
639 SiblingList* siblings() { return &siblings_; }
640 void set_siblings(SiblingList* other) { siblings_ = *other; }
641
642 protected:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000643 enum LimitResult { DONE, FAIL, CONTINUE };
ager@chromium.org32912102009-01-16 10:38:43 +0000644 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000645
646 // Returns a sibling of this node whose interests and assumptions
647 // match the ones in the given node info. If no sibling exists NULL
648 // is returned.
649 RegExpNode* TryGetSibling(NodeInfo* info);
650
651 // Returns a sibling of this node whose interests match the ones in
652 // the given node info. The info must not contain any assertions.
653 // If no node exists a new one will be created by cloning the current
654 // node. The result will always be an instance of the same concrete
655 // class as this node.
656 RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned);
657
658 // Returns a clone of this node initialized using the copy constructor
659 // of its concrete class. Note that the node may have to be pre-
ager@chromium.org32912102009-01-16 10:38:43 +0000660 // processed before it is on a usable state.
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000661 virtual RegExpNode* Clone() = 0;
662
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000663 private:
664 Label label_;
665 NodeInfo info_;
666 SiblingList siblings_;
ager@chromium.org32912102009-01-16 10:38:43 +0000667 // This variable keeps track of how many times code has been generated for
668 // this node (in different traces). We don't keep track of where the
669 // generated code is located unless the code is generated at the start of
670 // a trace, in which case it is generic and can be reused by flushing the
671 // deferred operations in the current trace and generating a goto.
672 int trace_count_;
673};
674
675
676// A simple closed interval.
677class Interval {
678 public:
679 Interval() : from_(kNone), to_(kNone) { }
680 Interval(int from, int to) : from_(from), to_(to) { }
681 Interval Union(Interval that) {
682 if (that.from_ == kNone)
683 return *this;
684 else if (from_ == kNone)
685 return that;
686 else
687 return Interval(Min(from_, that.from_), Max(to_, that.to_));
688 }
689 bool Contains(int value) {
690 return (from_ <= value) && (value <= to_);
691 }
692 bool is_empty() { return from_ == kNone; }
693 int from() { return from_; }
694 int to() { return to_; }
695 static Interval Empty() { return Interval(); }
696 static const int kNone = -1;
697 private:
698 int from_;
699 int to_;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000700};
701
702
703class SeqRegExpNode: public RegExpNode {
704 public:
705 explicit SeqRegExpNode(RegExpNode* on_success)
706 : on_success_(on_success) { }
707 RegExpNode* on_success() { return on_success_; }
708 void set_on_success(RegExpNode* node) { on_success_ = node; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000709 private:
710 RegExpNode* on_success_;
711};
712
713
714class ActionNode: public SeqRegExpNode {
715 public:
716 enum Type {
ager@chromium.org8bb60582008-12-11 12:02:20 +0000717 SET_REGISTER,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000718 INCREMENT_REGISTER,
719 STORE_POSITION,
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000720 BEGIN_SUBMATCH,
ager@chromium.org32912102009-01-16 10:38:43 +0000721 POSITIVE_SUBMATCH_SUCCESS,
722 EMPTY_MATCH_CHECK,
723 CLEAR_CAPTURES
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000724 };
ager@chromium.org8bb60582008-12-11 12:02:20 +0000725 static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000726 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
727 static ActionNode* StorePosition(int reg, RegExpNode* on_success);
ager@chromium.org32912102009-01-16 10:38:43 +0000728 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
729 static ActionNode* BeginSubmatch(int stack_pointer_reg,
730 int position_reg,
731 RegExpNode* on_success);
732 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
733 int restore_reg,
734 RegExpNode* on_success);
735 static ActionNode* EmptyMatchCheck(int start_register,
736 int repetition_register,
737 int repetition_limit,
738 RegExpNode* on_success);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000739 virtual void Accept(NodeVisitor* visitor);
ager@chromium.org32912102009-01-16 10:38:43 +0000740 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000741 virtual int EatsAtLeast(int recursion_depth);
742 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
743 RegExpCompiler* compiler,
744 int filled_in) {
745 return on_success()->GetQuickCheckDetails(details, compiler, filled_in);
746 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000747 virtual RegExpNode* PropagateForward(NodeInfo* info);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000748 Type type() { return type_; }
749 // TODO(erikcorry): We should allow some action nodes in greedy loops.
750 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000751 virtual ActionNode* Clone() { return new ActionNode(*this); }
752
753 private:
754 union {
755 struct {
756 int reg;
757 int value;
758 } u_store_register;
759 struct {
760 int reg;
761 } u_increment_register;
762 struct {
763 int reg;
764 } u_position_register;
765 struct {
766 int stack_pointer_register;
767 int current_position_register;
768 } u_submatch;
ager@chromium.org32912102009-01-16 10:38:43 +0000769 struct {
770 int start_register;
771 int repetition_register;
772 int repetition_limit;
773 } u_empty_match_check;
774 struct {
775 int range_from;
776 int range_to;
777 } u_clear_captures;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000778 } data_;
779 ActionNode(Type type, RegExpNode* on_success)
780 : SeqRegExpNode(on_success),
781 type_(type) { }
782 Type type_;
783 friend class DotPrinter;
784};
785
786
787class TextNode: public SeqRegExpNode {
788 public:
789 TextNode(ZoneList<TextElement>* elms,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000790 RegExpNode* on_success)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000791 : SeqRegExpNode(on_success),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000792 elms_(elms) { }
793 TextNode(RegExpCharacterClass* that,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000794 RegExpNode* on_success)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000795 : SeqRegExpNode(on_success),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000796 elms_(new ZoneList<TextElement>(1)) {
797 elms_->Add(TextElement::CharClass(that));
798 }
799 virtual void Accept(NodeVisitor* visitor);
800 virtual RegExpNode* PropagateForward(NodeInfo* info);
ager@chromium.org32912102009-01-16 10:38:43 +0000801 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000802 virtual int EatsAtLeast(int recursion_depth);
803 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
804 RegExpCompiler* compiler,
805 int characters_filled_in);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000806 ZoneList<TextElement>* elements() { return elms_; }
807 void MakeCaseIndependent();
ager@chromium.org8bb60582008-12-11 12:02:20 +0000808 virtual int GreedyLoopTextLength();
809 virtual TextNode* Clone() {
810 TextNode* result = new TextNode(*this);
811 result->CalculateOffsets();
812 return result;
813 }
814 void CalculateOffsets();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000815
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000816 private:
817 enum TextEmitPassType {
818 NON_ASCII_MATCH,
819 CHARACTER_MATCH,
820 CASE_CHARACTER_MATCH,
821 CHARACTER_CLASS_MATCH
822 };
823 void TextEmitPass(RegExpCompiler* compiler,
824 TextEmitPassType pass,
825 bool preloaded,
ager@chromium.org32912102009-01-16 10:38:43 +0000826 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000827 bool first_element_checked,
828 int* checked_up_to);
829 int Length();
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000830 ZoneList<TextElement>* elms_;
831};
832
833
834class BackReferenceNode: public SeqRegExpNode {
835 public:
836 BackReferenceNode(int start_reg,
837 int end_reg,
ager@chromium.org8bb60582008-12-11 12:02:20 +0000838 RegExpNode* on_success)
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000839 : SeqRegExpNode(on_success),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000840 start_reg_(start_reg),
841 end_reg_(end_reg) { }
842 virtual void Accept(NodeVisitor* visitor);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000843 int start_register() { return start_reg_; }
844 int end_register() { return end_reg_; }
ager@chromium.org32912102009-01-16 10:38:43 +0000845 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000846 virtual int EatsAtLeast(int recursion_depth) { return 0; }
847 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
848 RegExpCompiler* compiler,
849 int characters_filled_in) {
850 return;
851 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000852 virtual RegExpNode* PropagateForward(NodeInfo* info);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000853 virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
854
855 private:
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000856 int start_reg_;
857 int end_reg_;
858};
859
860
861class EndNode: public RegExpNode {
862 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000863 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000864 explicit EndNode(Action action) : action_(action) { }
865 virtual void Accept(NodeVisitor* visitor);
ager@chromium.org32912102009-01-16 10:38:43 +0000866 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000867 virtual int EatsAtLeast(int recursion_depth) { return 0; }
868 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
869 RegExpCompiler* compiler,
870 int characters_filled_in) {
871 // Returning 0 from EatsAtLeast should ensure we never get here.
872 UNREACHABLE();
873 }
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000874 virtual RegExpNode* PropagateForward(NodeInfo* info);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000875 virtual EndNode* Clone() { return new EndNode(*this); }
876
ager@chromium.org8bb60582008-12-11 12:02:20 +0000877 protected:
ager@chromium.org32912102009-01-16 10:38:43 +0000878 void EmitInfoChecks(RegExpMacroAssembler* macro, Trace* trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000879
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000880 private:
881 Action action_;
882};
883
884
ager@chromium.org8bb60582008-12-11 12:02:20 +0000885class NegativeSubmatchSuccess: public EndNode {
886 public:
887 NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg)
888 : EndNode(NEGATIVE_SUBMATCH_SUCCESS),
889 stack_pointer_register_(stack_pointer_reg),
890 current_position_register_(position_reg) { }
ager@chromium.org32912102009-01-16 10:38:43 +0000891 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000892
893 private:
894 int stack_pointer_register_;
895 int current_position_register_;
896};
897
898
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000899class Guard: public ZoneObject {
900 public:
901 enum Relation { LT, GEQ };
902 Guard(int reg, Relation op, int value)
903 : reg_(reg),
904 op_(op),
905 value_(value) { }
906 int reg() { return reg_; }
907 Relation op() { return op_; }
908 int value() { return value_; }
909
910 private:
911 int reg_;
912 Relation op_;
913 int value_;
914};
915
916
917class GuardedAlternative {
918 public:
919 explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
920 void AddGuard(Guard* guard);
921 RegExpNode* node() { return node_; }
922 void set_node(RegExpNode* node) { node_ = node; }
923 ZoneList<Guard*>* guards() { return guards_; }
924
925 private:
926 RegExpNode* node_;
927 ZoneList<Guard*>* guards_;
928};
929
930
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000931class AlternativeGeneration;
932
933
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000934class ChoiceNode: public RegExpNode {
935 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +0000936 explicit ChoiceNode(int expected_size)
937 : alternatives_(new ZoneList<GuardedAlternative>(expected_size)),
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000938 table_(NULL),
939 being_calculated_(false) { }
940 virtual void Accept(NodeVisitor* visitor);
941 void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); }
942 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
943 DispatchTable* GetTable(bool ignore_case);
ager@chromium.org32912102009-01-16 10:38:43 +0000944 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000945 virtual int EatsAtLeast(int recursion_depth);
946 int EatsAtLeastHelper(int recursion_depth, RegExpNode* ignore_this_node);
947 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
948 RegExpCompiler* compiler,
949 int characters_filled_in);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000950 virtual RegExpNode* PropagateForward(NodeInfo* info);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000951 virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
952
953 bool being_calculated() { return being_calculated_; }
954 void set_being_calculated(bool b) { being_calculated_ = b; }
955
ager@chromium.org8bb60582008-12-11 12:02:20 +0000956 protected:
957 int GreedyLoopTextLength(GuardedAlternative *alternative);
958 ZoneList<GuardedAlternative>* alternatives_;
959
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000960 private:
961 friend class DispatchTableConstructor;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000962 friend class Analysis;
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000963 void GenerateGuard(RegExpMacroAssembler* macro_assembler,
964 Guard *guard,
ager@chromium.org32912102009-01-16 10:38:43 +0000965 Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000966 int CalculatePreloadCharacters(RegExpCompiler* compiler);
967 bool EmitOutOfLineContinuation(RegExpCompiler* compiler,
ager@chromium.org32912102009-01-16 10:38:43 +0000968 Trace* trace,
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000969 GuardedAlternative alternative,
970 AlternativeGeneration* alt_gen,
971 int preload_characters,
972 bool next_expects_preload);
ager@chromium.orga74f0da2008-12-03 16:05:52 +0000973 DispatchTable* table_;
974 bool being_calculated_;
975};
976
977
ager@chromium.org8bb60582008-12-11 12:02:20 +0000978class LoopChoiceNode: public ChoiceNode {
979 public:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000980 explicit LoopChoiceNode(bool body_can_be_zero_length)
981 : ChoiceNode(2),
982 loop_node_(NULL),
983 continue_node_(NULL),
984 body_can_be_zero_length_(body_can_be_zero_length) { }
985 void AddLoopAlternative(GuardedAlternative alt);
986 void AddContinueAlternative(GuardedAlternative alt);
ager@chromium.org32912102009-01-16 10:38:43 +0000987 virtual bool Emit(RegExpCompiler* compiler, Trace* trace);
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000988 virtual int EatsAtLeast(int recursion_depth); // Returns 0.
989 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
990 RegExpCompiler* compiler,
991 int characters_filled_in);
ager@chromium.org8bb60582008-12-11 12:02:20 +0000992 virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +0000993 RegExpNode* loop_node() { return loop_node_; }
994 RegExpNode* continue_node() { return continue_node_; }
995 bool body_can_be_zero_length() { return body_can_be_zero_length_; }
996 virtual void Accept(NodeVisitor* visitor);
997
998 private:
999 // AddAlternative is made private for loop nodes because alternatives
1000 // should not be added freely, we need to keep track of which node
1001 // goes back to the node itself.
1002 void AddAlternative(GuardedAlternative node) {
1003 ChoiceNode::AddAlternative(node);
1004 }
1005
1006 RegExpNode* loop_node_;
1007 RegExpNode* continue_node_;
1008 bool body_can_be_zero_length_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001009};
1010
1011
1012// There are many ways to generate code for a node. This class encapsulates
1013// the current way we should be generating. In other words it encapsulates
ager@chromium.org32912102009-01-16 10:38:43 +00001014// the current state of the code generator. The effect of this is that we
1015// generate code for paths that the matcher can take through the regular
1016// expression. A given node in the regexp can be code-generated several times
1017// as it can be part of several traces. For example for the regexp:
1018// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1019// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1020// to match foo is generated only once (the traces have a common prefix). The
1021// code to store the capture is deferred and generated (twice) after the places
1022// where baz has been matched.
1023class Trace {
ager@chromium.org8bb60582008-12-11 12:02:20 +00001024 public:
1025 class DeferredAction {
1026 public:
1027 DeferredAction(ActionNode::Type type, int reg)
1028 : type_(type), reg_(reg), next_(NULL) { }
1029 DeferredAction* next() { return next_; }
ager@chromium.org32912102009-01-16 10:38:43 +00001030 bool Mentions(int reg);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001031 int reg() { return reg_; }
1032 ActionNode::Type type() { return type_; }
1033 private:
1034 ActionNode::Type type_;
1035 int reg_;
1036 DeferredAction* next_;
ager@chromium.org32912102009-01-16 10:38:43 +00001037 friend class Trace;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001038 };
1039
1040 class DeferredCapture: public DeferredAction {
1041 public:
ager@chromium.org32912102009-01-16 10:38:43 +00001042 DeferredCapture(int reg, Trace* trace)
ager@chromium.org8bb60582008-12-11 12:02:20 +00001043 : DeferredAction(ActionNode::STORE_POSITION, reg),
ager@chromium.org32912102009-01-16 10:38:43 +00001044 cp_offset_(trace->cp_offset()) { }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001045 int cp_offset() { return cp_offset_; }
1046 private:
1047 int cp_offset_;
1048 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1049 };
1050
1051 class DeferredSetRegister :public DeferredAction {
1052 public:
1053 DeferredSetRegister(int reg, int value)
1054 : DeferredAction(ActionNode::SET_REGISTER, reg),
1055 value_(value) { }
1056 int value() { return value_; }
1057 private:
1058 int value_;
1059 };
1060
ager@chromium.org32912102009-01-16 10:38:43 +00001061 class DeferredClearCaptures : public DeferredAction {
1062 public:
1063 explicit DeferredClearCaptures(Interval range)
1064 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1065 range_(range) { }
1066 Interval range() { return range_; }
1067 private:
1068 Interval range_;
1069 };
1070
ager@chromium.org8bb60582008-12-11 12:02:20 +00001071 class DeferredIncrementRegister: public DeferredAction {
1072 public:
1073 explicit DeferredIncrementRegister(int reg)
1074 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1075 };
1076
ager@chromium.org32912102009-01-16 10:38:43 +00001077 Trace()
ager@chromium.org8bb60582008-12-11 12:02:20 +00001078 : cp_offset_(0),
1079 actions_(NULL),
1080 backtrack_(NULL),
1081 stop_node_(NULL),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001082 loop_label_(NULL),
1083 characters_preloaded_(0),
1084 bound_checked_up_to_(0) { }
ager@chromium.org32912102009-01-16 10:38:43 +00001085 // End the trace. This involves flushing the deferred actions in the trace
1086 // and pushing a backtrack location onto the backtrack stack. Once this is
1087 // done we can start a new trace or go to one that has already been
1088 // generated.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001089 bool Flush(RegExpCompiler* compiler, RegExpNode* successor);
1090 int cp_offset() { return cp_offset_; }
1091 DeferredAction* actions() { return actions_; }
ager@chromium.org32912102009-01-16 10:38:43 +00001092 // A trivial trace is one that has no deferred actions or other state that
1093 // affects the assumptions used when generating code. There is no recorded
1094 // backtrack location in a trivial trace, so with a trivial trace we will
1095 // generate code that, on a failure to match, gets the backtrack location
1096 // from the backtrack stack rather than using a direct jump instruction. We
1097 // always start code generation with a trivial trace and non-trivial traces
1098 // are created as we emit code for nodes or add to the list of deferred
1099 // actions in the trace. The location of the code generated for a node using
1100 // a trivial trace is recorded in a label in the node so that gotos can be
1101 // generated to that code.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001102 bool is_trivial() {
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001103 return backtrack_ == NULL &&
1104 actions_ == NULL &&
1105 cp_offset_ == 0 &&
1106 characters_preloaded_ == 0 &&
1107 bound_checked_up_to_ == 0 &&
1108 quick_check_performed_.characters() == 0;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001109 }
1110 Label* backtrack() { return backtrack_; }
1111 Label* loop_label() { return loop_label_; }
1112 RegExpNode* stop_node() { return stop_node_; }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001113 int characters_preloaded() { return characters_preloaded_; }
1114 int bound_checked_up_to() { return bound_checked_up_to_; }
1115 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1116 bool mentions_reg(int reg);
ager@chromium.org32912102009-01-16 10:38:43 +00001117 // Returns true if a deferred position store exists to the specified
1118 // register and stores the offset in the out-parameter. Otherwise
1119 // returns false.
1120 bool GetStoredPosition(int reg, int* cp_offset);
1121 // These set methods and AdvanceCurrentPositionInTrace should be used only on
1122 // new traces - the intention is that traces are immutable after creation.
ager@chromium.org8bb60582008-12-11 12:02:20 +00001123 void add_action(DeferredAction* new_action) {
1124 ASSERT(new_action->next_ == NULL);
1125 new_action->next_ = actions_;
1126 actions_ = new_action;
1127 }
ager@chromium.org8bb60582008-12-11 12:02:20 +00001128 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1129 void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1130 void set_loop_label(Label* label) { loop_label_ = label; }
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001131 void set_characters_preloaded(int cpre) { characters_preloaded_ = cpre; }
1132 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1133 void set_quick_check_performed(QuickCheckDetails* d) {
1134 quick_check_performed_ = *d;
1135 }
1136 void clear_quick_check_performed() {
1137 }
ager@chromium.org32912102009-01-16 10:38:43 +00001138 void AdvanceCurrentPositionInTrace(int by, bool ascii);
ager@chromium.org8bb60582008-12-11 12:02:20 +00001139 private:
1140 int FindAffectedRegisters(OutSet* affected_registers);
1141 void PerformDeferredActions(RegExpMacroAssembler* macro,
1142 int max_register,
1143 OutSet& affected_registers);
1144 void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1145 int max_register,
1146 OutSet& affected_registers);
1147 void PushAffectedRegisters(RegExpMacroAssembler* macro,
1148 int max_register,
1149 OutSet& affected_registers);
1150 int cp_offset_;
1151 DeferredAction* actions_;
1152 Label* backtrack_;
1153 RegExpNode* stop_node_;
1154 Label* loop_label_;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001155 int characters_preloaded_;
1156 int bound_checked_up_to_;
1157 QuickCheckDetails quick_check_performed_;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001158};
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001159
1160
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001161class NodeVisitor {
1162 public:
1163 virtual ~NodeVisitor() { }
1164#define DECLARE_VISIT(Type) \
1165 virtual void Visit##Type(Type##Node* that) = 0;
1166FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1167#undef DECLARE_VISIT
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001168 virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001169};
1170
1171
1172// Node visitor used to add the start set of the alternatives to the
1173// dispatch table of a choice node.
1174class DispatchTableConstructor: public NodeVisitor {
1175 public:
1176 DispatchTableConstructor(DispatchTable* table, bool ignore_case)
1177 : table_(table),
1178 choice_index_(-1),
1179 ignore_case_(ignore_case) { }
1180
1181 void BuildTable(ChoiceNode* node);
1182
1183 void AddRange(CharacterRange range) {
1184 table()->AddRange(range, choice_index_);
1185 }
1186
1187 void AddInverse(ZoneList<CharacterRange>* ranges);
1188
1189#define DECLARE_VISIT(Type) \
1190 virtual void Visit##Type(Type##Node* that);
1191FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1192#undef DECLARE_VISIT
1193
1194 DispatchTable* table() { return table_; }
1195 void set_choice_index(int value) { choice_index_ = value; }
1196
1197 protected:
1198 DispatchTable *table_;
1199 int choice_index_;
1200 bool ignore_case_;
1201};
1202
1203
ager@chromium.org8bb60582008-12-11 12:02:20 +00001204// Assertion propagation moves information about assertions such as
1205// \b to the affected nodes. For instance, in /.\b./ information must
1206// be propagated to the first '.' that whatever follows needs to know
1207// if it matched a word or a non-word, and to the second '.' that it
1208// has to check if it succeeds a word or non-word. In this case the
1209// result will be something like:
1210//
1211// +-------+ +------------+
1212// | . | | . |
1213// +-------+ ---> +------------+
1214// | word? | | check word |
1215// +-------+ +------------+
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001216class Analysis: public NodeVisitor {
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001217 public:
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001218 explicit Analysis(bool ignore_case)
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001219 : ignore_case_(ignore_case) { }
1220 void EnsureAnalyzed(RegExpNode* node);
1221
1222#define DECLARE_VISIT(Type) \
1223 virtual void Visit##Type(Type##Node* that);
1224FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1225#undef DECLARE_VISIT
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001226 virtual void VisitLoopChoice(LoopChoiceNode* that);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001227
1228 private:
1229 bool ignore_case_;
1230
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001231 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001232};
1233
1234
ager@chromium.org8bb60582008-12-11 12:02:20 +00001235struct RegExpCompileData {
1236 RegExpCompileData()
1237 : tree(NULL),
1238 node(NULL),
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001239 simple(true),
ager@chromium.org8bb60582008-12-11 12:02:20 +00001240 capture_count(0) { }
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001241 RegExpTree* tree;
ager@chromium.org8bb60582008-12-11 12:02:20 +00001242 RegExpNode* node;
christian.plesner.hansen@gmail.com37abdec2009-01-06 14:43:28 +00001243 bool simple;
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001244 Handle<String> error;
1245 int capture_count;
1246};
1247
1248
1249class RegExpEngine: public AllStatic {
1250 public:
ager@chromium.org8bb60582008-12-11 12:02:20 +00001251 static Handle<FixedArray> Compile(RegExpCompileData* input,
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001252 bool ignore_case,
ager@chromium.org8bb60582008-12-11 12:02:20 +00001253 bool multiline,
1254 Handle<String> pattern,
1255 bool is_ascii);
1256
ager@chromium.orga74f0da2008-12-03 16:05:52 +00001257 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1258};
1259
1260
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00001261} } // namespace v8::internal
1262
1263#endif // V8_JSREGEXP_H_