blob: 406bf84233b3c5e155ec85cf9f67b64260084422 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_AST_H_
6#define V8_REGEXP_REGEXP_AST_H_
7
Ben Murdoch097c5b22016-05-18 11:27:45 +01008#include "src/objects.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00009#include "src/utils.h"
Ben Murdoch61f157c2016-09-16 13:49:30 +010010#include "src/zone-containers.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000011#include "src/zone.h"
12
13namespace v8 {
14namespace internal {
15
16#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
17 VISIT(Disjunction) \
18 VISIT(Alternative) \
19 VISIT(Assertion) \
20 VISIT(CharacterClass) \
21 VISIT(Atom) \
22 VISIT(Quantifier) \
23 VISIT(Capture) \
24 VISIT(Lookaround) \
25 VISIT(BackReference) \
26 VISIT(Empty) \
27 VISIT(Text)
28
29
30#define FORWARD_DECLARE(Name) class RegExp##Name;
31FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
32#undef FORWARD_DECLARE
33
34class RegExpCompiler;
35class RegExpNode;
36class RegExpTree;
37
38
39class RegExpVisitor BASE_EMBEDDED {
40 public:
41 virtual ~RegExpVisitor() {}
42#define MAKE_CASE(Name) \
43 virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
44 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
45#undef MAKE_CASE
46};
47
48
49// A simple closed interval.
50class Interval {
51 public:
52 Interval() : from_(kNone), to_(kNone) {}
53 Interval(int from, int to) : from_(from), to_(to) {}
54 Interval Union(Interval that) {
55 if (that.from_ == kNone)
56 return *this;
57 else if (from_ == kNone)
58 return that;
59 else
60 return Interval(Min(from_, that.from_), Max(to_, that.to_));
61 }
62 bool Contains(int value) { return (from_ <= value) && (value <= to_); }
63 bool is_empty() { return from_ == kNone; }
64 int from() const { return from_; }
65 int to() const { return to_; }
66 static Interval Empty() { return Interval(); }
67 static const int kNone = -1;
68
69 private:
70 int from_;
71 int to_;
72};
73
74
75// Represents code units in the range from from_ to to_, both ends are
76// inclusive.
77class CharacterRange {
78 public:
79 CharacterRange() : from_(0), to_(0) {}
80 // For compatibility with the CHECK_OK macro
81 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000082 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
83 Zone* zone);
84 static Vector<const int> GetWordBounds();
Ben Murdoch097c5b22016-05-18 11:27:45 +010085 static inline CharacterRange Singleton(uc32 value) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000086 return CharacterRange(value, value);
87 }
Ben Murdoch097c5b22016-05-18 11:27:45 +010088 static inline CharacterRange Range(uc32 from, uc32 to) {
89 DCHECK(0 <= from && to <= String::kMaxCodePoint);
90 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000091 return CharacterRange(from, to);
92 }
93 static inline CharacterRange Everything() {
Ben Murdoch097c5b22016-05-18 11:27:45 +010094 return CharacterRange(0, String::kMaxCodePoint);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000095 }
Ben Murdoch097c5b22016-05-18 11:27:45 +010096 static inline ZoneList<CharacterRange>* List(Zone* zone,
97 CharacterRange range) {
98 ZoneList<CharacterRange>* list =
99 new (zone) ZoneList<CharacterRange>(1, zone);
100 list->Add(range, zone);
101 return list;
102 }
103 bool Contains(uc32 i) { return from_ <= i && i <= to_; }
104 uc32 from() const { return from_; }
105 void set_from(uc32 value) { from_ = value; }
106 uc32 to() const { return to_; }
107 void set_to(uc32 value) { to_ = value; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000108 bool is_valid() { return from_ <= to_; }
109 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
110 bool IsSingleton() { return (from_ == to_); }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100111 static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
112 ZoneList<CharacterRange>* ranges,
113 bool is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000114 // Whether a range list is in canonical form: Ranges ordered by from value,
115 // and ranges non-overlapping and non-adjacent.
116 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
117 // Convert range list to canonical form. The characters covered by the ranges
118 // will still be the same, but no character is in more than one range, and
119 // adjacent ranges are merged. The resulting list may be shorter than the
120 // original, but cannot be longer.
121 static void Canonicalize(ZoneList<CharacterRange>* ranges);
122 // Negate the contents of a character range in canonical form.
123 static void Negate(ZoneList<CharacterRange>* src,
124 ZoneList<CharacterRange>* dst, Zone* zone);
125 static const int kStartMarker = (1 << 24);
126 static const int kPayloadMask = (1 << 24) - 1;
127
128 private:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100129 CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
130
131 uc32 from_;
132 uc32 to_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000133};
134
135
136class CharacterSet final BASE_EMBEDDED {
137 public:
138 explicit CharacterSet(uc16 standard_set_type)
139 : ranges_(NULL), standard_set_type_(standard_set_type) {}
140 explicit CharacterSet(ZoneList<CharacterRange>* ranges)
141 : ranges_(ranges), standard_set_type_(0) {}
142 ZoneList<CharacterRange>* ranges(Zone* zone);
143 uc16 standard_set_type() { return standard_set_type_; }
144 void set_standard_set_type(uc16 special_set_type) {
145 standard_set_type_ = special_set_type;
146 }
147 bool is_standard() { return standard_set_type_ != 0; }
148 void Canonicalize();
149
150 private:
151 ZoneList<CharacterRange>* ranges_;
152 // If non-zero, the value represents a standard set (e.g., all whitespace
153 // characters) without having to expand the ranges.
154 uc16 standard_set_type_;
155};
156
157
158class TextElement final BASE_EMBEDDED {
159 public:
160 enum TextType { ATOM, CHAR_CLASS };
161
162 static TextElement Atom(RegExpAtom* atom);
163 static TextElement CharClass(RegExpCharacterClass* char_class);
164
165 int cp_offset() const { return cp_offset_; }
166 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
167 int length() const;
168
169 TextType text_type() const { return text_type_; }
170
171 RegExpTree* tree() const { return tree_; }
172
173 RegExpAtom* atom() const {
174 DCHECK(text_type() == ATOM);
175 return reinterpret_cast<RegExpAtom*>(tree());
176 }
177
178 RegExpCharacterClass* char_class() const {
179 DCHECK(text_type() == CHAR_CLASS);
180 return reinterpret_cast<RegExpCharacterClass*>(tree());
181 }
182
183 private:
184 TextElement(TextType text_type, RegExpTree* tree)
185 : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
186
187 int cp_offset_;
188 TextType text_type_;
189 RegExpTree* tree_;
190};
191
192
193class RegExpTree : public ZoneObject {
194 public:
195 static const int kInfinity = kMaxInt;
196 virtual ~RegExpTree() {}
197 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
198 virtual RegExpNode* ToNode(RegExpCompiler* compiler,
199 RegExpNode* on_success) = 0;
200 virtual bool IsTextElement() { return false; }
201 virtual bool IsAnchoredAtStart() { return false; }
202 virtual bool IsAnchoredAtEnd() { return false; }
203 virtual int min_match() = 0;
204 virtual int max_match() = 0;
205 // Returns the interval of registers used for captures within this
206 // expression.
207 virtual Interval CaptureRegisters() { return Interval::Empty(); }
208 virtual void AppendToText(RegExpText* text, Zone* zone);
209 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT
210#define MAKE_ASTYPE(Name) \
211 virtual RegExp##Name* As##Name(); \
212 virtual bool Is##Name();
213 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
214#undef MAKE_ASTYPE
215};
216
217
218class RegExpDisjunction final : public RegExpTree {
219 public:
220 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
221 void* Accept(RegExpVisitor* visitor, void* data) override;
222 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
223 RegExpDisjunction* AsDisjunction() override;
224 Interval CaptureRegisters() override;
225 bool IsDisjunction() override;
226 bool IsAnchoredAtStart() override;
227 bool IsAnchoredAtEnd() override;
228 int min_match() override { return min_match_; }
229 int max_match() override { return max_match_; }
230 ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
231
232 private:
233 bool SortConsecutiveAtoms(RegExpCompiler* compiler);
234 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
235 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
236 ZoneList<RegExpTree*>* alternatives_;
237 int min_match_;
238 int max_match_;
239};
240
241
242class RegExpAlternative final : public RegExpTree {
243 public:
244 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
245 void* Accept(RegExpVisitor* visitor, void* data) override;
246 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
247 RegExpAlternative* AsAlternative() override;
248 Interval CaptureRegisters() override;
249 bool IsAlternative() override;
250 bool IsAnchoredAtStart() override;
251 bool IsAnchoredAtEnd() override;
252 int min_match() override { return min_match_; }
253 int max_match() override { return max_match_; }
254 ZoneList<RegExpTree*>* nodes() { return nodes_; }
255
256 private:
257 ZoneList<RegExpTree*>* nodes_;
258 int min_match_;
259 int max_match_;
260};
261
262
263class RegExpAssertion final : public RegExpTree {
264 public:
265 enum AssertionType {
266 START_OF_LINE,
267 START_OF_INPUT,
268 END_OF_LINE,
269 END_OF_INPUT,
270 BOUNDARY,
271 NON_BOUNDARY
272 };
273 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
274 void* Accept(RegExpVisitor* visitor, void* data) override;
275 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
276 RegExpAssertion* AsAssertion() override;
277 bool IsAssertion() override;
278 bool IsAnchoredAtStart() override;
279 bool IsAnchoredAtEnd() override;
280 int min_match() override { return 0; }
281 int max_match() override { return 0; }
282 AssertionType assertion_type() { return assertion_type_; }
283
284 private:
285 AssertionType assertion_type_;
286};
287
288
289class RegExpCharacterClass final : public RegExpTree {
290 public:
291 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
292 : set_(ranges), is_negated_(is_negated) {}
293 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
294 void* Accept(RegExpVisitor* visitor, void* data) override;
295 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
296 RegExpCharacterClass* AsCharacterClass() override;
297 bool IsCharacterClass() override;
298 bool IsTextElement() override { return true; }
299 int min_match() override { return 1; }
Ben Murdochc5610432016-08-08 18:44:38 +0100300 // The character class may match two code units for unicode regexps.
301 // TODO(yangguo): we should split this class for usage in TextElement, and
302 // make max_match() dependent on the character class content.
303 int max_match() override { return 2; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000304 void AppendToText(RegExpText* text, Zone* zone) override;
305 CharacterSet character_set() { return set_; }
306 // TODO(lrn): Remove need for complex version if is_standard that
307 // recognizes a mangled standard set and just do { return set_.is_special(); }
308 bool is_standard(Zone* zone);
309 // Returns a value representing the standard character set if is_standard()
310 // returns true.
311 // Currently used values are:
312 // s : unicode whitespace
313 // S : unicode non-whitespace
314 // w : ASCII word character (digit, letter, underscore)
315 // W : non-ASCII word character
316 // d : ASCII digit
317 // D : non-ASCII digit
Ben Murdoch097c5b22016-05-18 11:27:45 +0100318 // . : non-newline
319 // * : All characters, for advancing unanchored regexp
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000320 uc16 standard_type() { return set_.standard_set_type(); }
321 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
322 bool is_negated() { return is_negated_; }
323
324 private:
325 CharacterSet set_;
326 bool is_negated_;
327};
328
329
330class RegExpAtom final : public RegExpTree {
331 public:
332 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
333 void* Accept(RegExpVisitor* visitor, void* data) override;
334 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
335 RegExpAtom* AsAtom() override;
336 bool IsAtom() override;
337 bool IsTextElement() override { return true; }
338 int min_match() override { return data_.length(); }
339 int max_match() override { return data_.length(); }
340 void AppendToText(RegExpText* text, Zone* zone) override;
341 Vector<const uc16> data() { return data_; }
342 int length() { return data_.length(); }
343
344 private:
345 Vector<const uc16> data_;
346};
347
348
349class RegExpText final : public RegExpTree {
350 public:
351 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
352 void* Accept(RegExpVisitor* visitor, void* data) override;
353 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
354 RegExpText* AsText() override;
355 bool IsText() override;
356 bool IsTextElement() override { return true; }
357 int min_match() override { return length_; }
358 int max_match() override { return length_; }
359 void AppendToText(RegExpText* text, Zone* zone) override;
360 void AddElement(TextElement elm, Zone* zone) {
361 elements_.Add(elm, zone);
362 length_ += elm.length();
363 }
364 ZoneList<TextElement>* elements() { return &elements_; }
365
366 private:
367 ZoneList<TextElement> elements_;
368 int length_;
369};
370
371
372class RegExpQuantifier final : public RegExpTree {
373 public:
374 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
375 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
376 : body_(body),
377 min_(min),
378 max_(max),
379 min_match_(min * body->min_match()),
380 quantifier_type_(type) {
381 if (max > 0 && body->max_match() > kInfinity / max) {
382 max_match_ = kInfinity;
383 } else {
384 max_match_ = max * body->max_match();
385 }
386 }
387 void* Accept(RegExpVisitor* visitor, void* data) override;
388 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
389 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
390 RegExpCompiler* compiler, RegExpNode* on_success,
391 bool not_at_start = false);
392 RegExpQuantifier* AsQuantifier() override;
393 Interval CaptureRegisters() override;
394 bool IsQuantifier() override;
395 int min_match() override { return min_match_; }
396 int max_match() override { return max_match_; }
397 int min() { return min_; }
398 int max() { return max_; }
399 bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
400 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
401 bool is_greedy() { return quantifier_type_ == GREEDY; }
402 RegExpTree* body() { return body_; }
403
404 private:
405 RegExpTree* body_;
406 int min_;
407 int max_;
408 int min_match_;
409 int max_match_;
410 QuantifierType quantifier_type_;
411};
412
413
414class RegExpCapture final : public RegExpTree {
415 public:
Ben Murdoch61f157c2016-09-16 13:49:30 +0100416 explicit RegExpCapture(int index)
417 : body_(NULL), index_(index), name_(nullptr) {}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000418 void* Accept(RegExpVisitor* visitor, void* data) override;
419 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
420 static RegExpNode* ToNode(RegExpTree* body, int index,
421 RegExpCompiler* compiler, RegExpNode* on_success);
422 RegExpCapture* AsCapture() override;
423 bool IsAnchoredAtStart() override;
424 bool IsAnchoredAtEnd() override;
425 Interval CaptureRegisters() override;
426 bool IsCapture() override;
427 int min_match() override { return body_->min_match(); }
428 int max_match() override { return body_->max_match(); }
429 RegExpTree* body() { return body_; }
430 void set_body(RegExpTree* body) { body_ = body; }
431 int index() { return index_; }
Ben Murdoch61f157c2016-09-16 13:49:30 +0100432 const ZoneVector<uc16>* name() const { return name_; }
433 void set_name(const ZoneVector<uc16>* name) { name_ = name; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000434 static int StartRegister(int index) { return index * 2; }
435 static int EndRegister(int index) { return index * 2 + 1; }
436
437 private:
438 RegExpTree* body_;
439 int index_;
Ben Murdoch61f157c2016-09-16 13:49:30 +0100440 const ZoneVector<uc16>* name_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000441};
442
443
444class RegExpLookaround final : public RegExpTree {
445 public:
446 enum Type { LOOKAHEAD, LOOKBEHIND };
447
448 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
449 int capture_from, Type type)
450 : body_(body),
451 is_positive_(is_positive),
452 capture_count_(capture_count),
453 capture_from_(capture_from),
454 type_(type) {}
455
456 void* Accept(RegExpVisitor* visitor, void* data) override;
457 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
458 RegExpLookaround* AsLookaround() override;
459 Interval CaptureRegisters() override;
460 bool IsLookaround() override;
461 bool IsAnchoredAtStart() override;
462 int min_match() override { return 0; }
463 int max_match() override { return 0; }
464 RegExpTree* body() { return body_; }
465 bool is_positive() { return is_positive_; }
466 int capture_count() { return capture_count_; }
467 int capture_from() { return capture_from_; }
468 Type type() { return type_; }
469
Ben Murdoch097c5b22016-05-18 11:27:45 +0100470 class Builder {
471 public:
472 Builder(bool is_positive, RegExpNode* on_success,
473 int stack_pointer_register, int position_register,
474 int capture_register_count = 0, int capture_register_start = 0);
475 RegExpNode* on_match_success() { return on_match_success_; }
476 RegExpNode* ForMatch(RegExpNode* match);
477
478 private:
479 bool is_positive_;
480 RegExpNode* on_match_success_;
481 RegExpNode* on_success_;
482 int stack_pointer_register_;
483 int position_register_;
484 };
485
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000486 private:
487 RegExpTree* body_;
488 bool is_positive_;
489 int capture_count_;
490 int capture_from_;
491 Type type_;
492};
493
494
495class RegExpBackReference final : public RegExpTree {
496 public:
Ben Murdoch61f157c2016-09-16 13:49:30 +0100497 RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
498 explicit RegExpBackReference(RegExpCapture* capture)
499 : capture_(capture), name_(nullptr) {}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000500 void* Accept(RegExpVisitor* visitor, void* data) override;
501 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
502 RegExpBackReference* AsBackReference() override;
503 bool IsBackReference() override;
504 int min_match() override { return 0; }
505 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
506 // recursion, we give up. Ignorance is bliss.
507 int max_match() override { return kInfinity; }
508 int index() { return capture_->index(); }
509 RegExpCapture* capture() { return capture_; }
Ben Murdoch61f157c2016-09-16 13:49:30 +0100510 void set_capture(RegExpCapture* capture) { capture_ = capture; }
511 const ZoneVector<uc16>* name() const { return name_; }
512 void set_name(const ZoneVector<uc16>* name) { name_ = name; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000513
514 private:
515 RegExpCapture* capture_;
Ben Murdoch61f157c2016-09-16 13:49:30 +0100516 const ZoneVector<uc16>* name_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000517};
518
519
520class RegExpEmpty final : public RegExpTree {
521 public:
522 RegExpEmpty() {}
523 void* Accept(RegExpVisitor* visitor, void* data) override;
524 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
525 RegExpEmpty* AsEmpty() override;
526 bool IsEmpty() override;
527 int min_match() override { return 0; }
528 int max_match() override { return 0; }
529};
530
531} // namespace internal
532} // namespace v8
533
534#endif // V8_REGEXP_REGEXP_AST_H_