blob: 0e718d3b4d15ecda1e6d56b3bc326ae5539a80e3 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_AST_H_
6#define V8_REGEXP_REGEXP_AST_H_
7
Ben Murdoch097c5b22016-05-18 11:27:45 +01008#include "src/objects.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00009#include "src/utils.h"
10#include "src/zone.h"
11
12namespace v8 {
13namespace internal {
14
15#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
16 VISIT(Disjunction) \
17 VISIT(Alternative) \
18 VISIT(Assertion) \
19 VISIT(CharacterClass) \
20 VISIT(Atom) \
21 VISIT(Quantifier) \
22 VISIT(Capture) \
23 VISIT(Lookaround) \
24 VISIT(BackReference) \
25 VISIT(Empty) \
26 VISIT(Text)
27
28
29#define FORWARD_DECLARE(Name) class RegExp##Name;
30FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
31#undef FORWARD_DECLARE
32
33class RegExpCompiler;
34class RegExpNode;
35class RegExpTree;
36
37
38class RegExpVisitor BASE_EMBEDDED {
39 public:
40 virtual ~RegExpVisitor() {}
41#define MAKE_CASE(Name) \
42 virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
43 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
44#undef MAKE_CASE
45};
46
47
48// A simple closed interval.
49class Interval {
50 public:
51 Interval() : from_(kNone), to_(kNone) {}
52 Interval(int from, int to) : from_(from), to_(to) {}
53 Interval Union(Interval that) {
54 if (that.from_ == kNone)
55 return *this;
56 else if (from_ == kNone)
57 return that;
58 else
59 return Interval(Min(from_, that.from_), Max(to_, that.to_));
60 }
61 bool Contains(int value) { return (from_ <= value) && (value <= to_); }
62 bool is_empty() { return from_ == kNone; }
63 int from() const { return from_; }
64 int to() const { return to_; }
65 static Interval Empty() { return Interval(); }
66 static const int kNone = -1;
67
68 private:
69 int from_;
70 int to_;
71};
72
73
74// Represents code units in the range from from_ to to_, both ends are
75// inclusive.
76class CharacterRange {
77 public:
78 CharacterRange() : from_(0), to_(0) {}
79 // For compatibility with the CHECK_OK macro
80 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000081 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
82 Zone* zone);
83 static Vector<const int> GetWordBounds();
Ben Murdoch097c5b22016-05-18 11:27:45 +010084 static inline CharacterRange Singleton(uc32 value) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000085 return CharacterRange(value, value);
86 }
Ben Murdoch097c5b22016-05-18 11:27:45 +010087 static inline CharacterRange Range(uc32 from, uc32 to) {
88 DCHECK(0 <= from && to <= String::kMaxCodePoint);
89 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000090 return CharacterRange(from, to);
91 }
92 static inline CharacterRange Everything() {
Ben Murdoch097c5b22016-05-18 11:27:45 +010093 return CharacterRange(0, String::kMaxCodePoint);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000094 }
Ben Murdoch097c5b22016-05-18 11:27:45 +010095 static inline ZoneList<CharacterRange>* List(Zone* zone,
96 CharacterRange range) {
97 ZoneList<CharacterRange>* list =
98 new (zone) ZoneList<CharacterRange>(1, zone);
99 list->Add(range, zone);
100 return list;
101 }
102 bool Contains(uc32 i) { return from_ <= i && i <= to_; }
103 uc32 from() const { return from_; }
104 void set_from(uc32 value) { from_ = value; }
105 uc32 to() const { return to_; }
106 void set_to(uc32 value) { to_ = value; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000107 bool is_valid() { return from_ <= to_; }
108 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
109 bool IsSingleton() { return (from_ == to_); }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100110 static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
111 ZoneList<CharacterRange>* ranges,
112 bool is_one_byte);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000113 // Whether a range list is in canonical form: Ranges ordered by from value,
114 // and ranges non-overlapping and non-adjacent.
115 static bool IsCanonical(ZoneList<CharacterRange>* ranges);
116 // Convert range list to canonical form. The characters covered by the ranges
117 // will still be the same, but no character is in more than one range, and
118 // adjacent ranges are merged. The resulting list may be shorter than the
119 // original, but cannot be longer.
120 static void Canonicalize(ZoneList<CharacterRange>* ranges);
121 // Negate the contents of a character range in canonical form.
122 static void Negate(ZoneList<CharacterRange>* src,
123 ZoneList<CharacterRange>* dst, Zone* zone);
124 static const int kStartMarker = (1 << 24);
125 static const int kPayloadMask = (1 << 24) - 1;
126
127 private:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100128 CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
129
130 uc32 from_;
131 uc32 to_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000132};
133
134
135class CharacterSet final BASE_EMBEDDED {
136 public:
137 explicit CharacterSet(uc16 standard_set_type)
138 : ranges_(NULL), standard_set_type_(standard_set_type) {}
139 explicit CharacterSet(ZoneList<CharacterRange>* ranges)
140 : ranges_(ranges), standard_set_type_(0) {}
141 ZoneList<CharacterRange>* ranges(Zone* zone);
142 uc16 standard_set_type() { return standard_set_type_; }
143 void set_standard_set_type(uc16 special_set_type) {
144 standard_set_type_ = special_set_type;
145 }
146 bool is_standard() { return standard_set_type_ != 0; }
147 void Canonicalize();
148
149 private:
150 ZoneList<CharacterRange>* ranges_;
151 // If non-zero, the value represents a standard set (e.g., all whitespace
152 // characters) without having to expand the ranges.
153 uc16 standard_set_type_;
154};
155
156
157class TextElement final BASE_EMBEDDED {
158 public:
159 enum TextType { ATOM, CHAR_CLASS };
160
161 static TextElement Atom(RegExpAtom* atom);
162 static TextElement CharClass(RegExpCharacterClass* char_class);
163
164 int cp_offset() const { return cp_offset_; }
165 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
166 int length() const;
167
168 TextType text_type() const { return text_type_; }
169
170 RegExpTree* tree() const { return tree_; }
171
172 RegExpAtom* atom() const {
173 DCHECK(text_type() == ATOM);
174 return reinterpret_cast<RegExpAtom*>(tree());
175 }
176
177 RegExpCharacterClass* char_class() const {
178 DCHECK(text_type() == CHAR_CLASS);
179 return reinterpret_cast<RegExpCharacterClass*>(tree());
180 }
181
182 private:
183 TextElement(TextType text_type, RegExpTree* tree)
184 : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
185
186 int cp_offset_;
187 TextType text_type_;
188 RegExpTree* tree_;
189};
190
191
192class RegExpTree : public ZoneObject {
193 public:
194 static const int kInfinity = kMaxInt;
195 virtual ~RegExpTree() {}
196 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
197 virtual RegExpNode* ToNode(RegExpCompiler* compiler,
198 RegExpNode* on_success) = 0;
199 virtual bool IsTextElement() { return false; }
200 virtual bool IsAnchoredAtStart() { return false; }
201 virtual bool IsAnchoredAtEnd() { return false; }
202 virtual int min_match() = 0;
203 virtual int max_match() = 0;
204 // Returns the interval of registers used for captures within this
205 // expression.
206 virtual Interval CaptureRegisters() { return Interval::Empty(); }
207 virtual void AppendToText(RegExpText* text, Zone* zone);
208 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT
209#define MAKE_ASTYPE(Name) \
210 virtual RegExp##Name* As##Name(); \
211 virtual bool Is##Name();
212 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
213#undef MAKE_ASTYPE
214};
215
216
217class RegExpDisjunction final : public RegExpTree {
218 public:
219 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
220 void* Accept(RegExpVisitor* visitor, void* data) override;
221 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
222 RegExpDisjunction* AsDisjunction() override;
223 Interval CaptureRegisters() override;
224 bool IsDisjunction() override;
225 bool IsAnchoredAtStart() override;
226 bool IsAnchoredAtEnd() override;
227 int min_match() override { return min_match_; }
228 int max_match() override { return max_match_; }
229 ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
230
231 private:
232 bool SortConsecutiveAtoms(RegExpCompiler* compiler);
233 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
234 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
235 ZoneList<RegExpTree*>* alternatives_;
236 int min_match_;
237 int max_match_;
238};
239
240
241class RegExpAlternative final : public RegExpTree {
242 public:
243 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
244 void* Accept(RegExpVisitor* visitor, void* data) override;
245 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
246 RegExpAlternative* AsAlternative() override;
247 Interval CaptureRegisters() override;
248 bool IsAlternative() override;
249 bool IsAnchoredAtStart() override;
250 bool IsAnchoredAtEnd() override;
251 int min_match() override { return min_match_; }
252 int max_match() override { return max_match_; }
253 ZoneList<RegExpTree*>* nodes() { return nodes_; }
254
255 private:
256 ZoneList<RegExpTree*>* nodes_;
257 int min_match_;
258 int max_match_;
259};
260
261
262class RegExpAssertion final : public RegExpTree {
263 public:
264 enum AssertionType {
265 START_OF_LINE,
266 START_OF_INPUT,
267 END_OF_LINE,
268 END_OF_INPUT,
269 BOUNDARY,
270 NON_BOUNDARY
271 };
272 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
273 void* Accept(RegExpVisitor* visitor, void* data) override;
274 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
275 RegExpAssertion* AsAssertion() override;
276 bool IsAssertion() override;
277 bool IsAnchoredAtStart() override;
278 bool IsAnchoredAtEnd() override;
279 int min_match() override { return 0; }
280 int max_match() override { return 0; }
281 AssertionType assertion_type() { return assertion_type_; }
282
283 private:
284 AssertionType assertion_type_;
285};
286
287
288class RegExpCharacterClass final : public RegExpTree {
289 public:
290 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
291 : set_(ranges), is_negated_(is_negated) {}
292 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
293 void* Accept(RegExpVisitor* visitor, void* data) override;
294 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
295 RegExpCharacterClass* AsCharacterClass() override;
296 bool IsCharacterClass() override;
297 bool IsTextElement() override { return true; }
298 int min_match() override { return 1; }
299 int max_match() override { return 1; }
300 void AppendToText(RegExpText* text, Zone* zone) override;
301 CharacterSet character_set() { return set_; }
302 // TODO(lrn): Remove need for complex version if is_standard that
303 // recognizes a mangled standard set and just do { return set_.is_special(); }
304 bool is_standard(Zone* zone);
305 // Returns a value representing the standard character set if is_standard()
306 // returns true.
307 // Currently used values are:
308 // s : unicode whitespace
309 // S : unicode non-whitespace
310 // w : ASCII word character (digit, letter, underscore)
311 // W : non-ASCII word character
312 // d : ASCII digit
313 // D : non-ASCII digit
Ben Murdoch097c5b22016-05-18 11:27:45 +0100314 // . : non-newline
315 // * : All characters, for advancing unanchored regexp
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000316 uc16 standard_type() { return set_.standard_set_type(); }
317 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
318 bool is_negated() { return is_negated_; }
319
320 private:
321 CharacterSet set_;
322 bool is_negated_;
323};
324
325
326class RegExpAtom final : public RegExpTree {
327 public:
328 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
329 void* Accept(RegExpVisitor* visitor, void* data) override;
330 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
331 RegExpAtom* AsAtom() override;
332 bool IsAtom() override;
333 bool IsTextElement() override { return true; }
334 int min_match() override { return data_.length(); }
335 int max_match() override { return data_.length(); }
336 void AppendToText(RegExpText* text, Zone* zone) override;
337 Vector<const uc16> data() { return data_; }
338 int length() { return data_.length(); }
339
340 private:
341 Vector<const uc16> data_;
342};
343
344
345class RegExpText final : public RegExpTree {
346 public:
347 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
348 void* Accept(RegExpVisitor* visitor, void* data) override;
349 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
350 RegExpText* AsText() override;
351 bool IsText() override;
352 bool IsTextElement() override { return true; }
353 int min_match() override { return length_; }
354 int max_match() override { return length_; }
355 void AppendToText(RegExpText* text, Zone* zone) override;
356 void AddElement(TextElement elm, Zone* zone) {
357 elements_.Add(elm, zone);
358 length_ += elm.length();
359 }
360 ZoneList<TextElement>* elements() { return &elements_; }
361
362 private:
363 ZoneList<TextElement> elements_;
364 int length_;
365};
366
367
368class RegExpQuantifier final : public RegExpTree {
369 public:
370 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
371 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
372 : body_(body),
373 min_(min),
374 max_(max),
375 min_match_(min * body->min_match()),
376 quantifier_type_(type) {
377 if (max > 0 && body->max_match() > kInfinity / max) {
378 max_match_ = kInfinity;
379 } else {
380 max_match_ = max * body->max_match();
381 }
382 }
383 void* Accept(RegExpVisitor* visitor, void* data) override;
384 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
385 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
386 RegExpCompiler* compiler, RegExpNode* on_success,
387 bool not_at_start = false);
388 RegExpQuantifier* AsQuantifier() override;
389 Interval CaptureRegisters() override;
390 bool IsQuantifier() override;
391 int min_match() override { return min_match_; }
392 int max_match() override { return max_match_; }
393 int min() { return min_; }
394 int max() { return max_; }
395 bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
396 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
397 bool is_greedy() { return quantifier_type_ == GREEDY; }
398 RegExpTree* body() { return body_; }
399
400 private:
401 RegExpTree* body_;
402 int min_;
403 int max_;
404 int min_match_;
405 int max_match_;
406 QuantifierType quantifier_type_;
407};
408
409
410class RegExpCapture final : public RegExpTree {
411 public:
412 explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
413 void* Accept(RegExpVisitor* visitor, void* data) override;
414 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
415 static RegExpNode* ToNode(RegExpTree* body, int index,
416 RegExpCompiler* compiler, RegExpNode* on_success);
417 RegExpCapture* AsCapture() override;
418 bool IsAnchoredAtStart() override;
419 bool IsAnchoredAtEnd() override;
420 Interval CaptureRegisters() override;
421 bool IsCapture() override;
422 int min_match() override { return body_->min_match(); }
423 int max_match() override { return body_->max_match(); }
424 RegExpTree* body() { return body_; }
425 void set_body(RegExpTree* body) { body_ = body; }
426 int index() { return index_; }
427 static int StartRegister(int index) { return index * 2; }
428 static int EndRegister(int index) { return index * 2 + 1; }
429
430 private:
431 RegExpTree* body_;
432 int index_;
433};
434
435
436class RegExpLookaround final : public RegExpTree {
437 public:
438 enum Type { LOOKAHEAD, LOOKBEHIND };
439
440 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
441 int capture_from, Type type)
442 : body_(body),
443 is_positive_(is_positive),
444 capture_count_(capture_count),
445 capture_from_(capture_from),
446 type_(type) {}
447
448 void* Accept(RegExpVisitor* visitor, void* data) override;
449 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
450 RegExpLookaround* AsLookaround() override;
451 Interval CaptureRegisters() override;
452 bool IsLookaround() override;
453 bool IsAnchoredAtStart() override;
454 int min_match() override { return 0; }
455 int max_match() override { return 0; }
456 RegExpTree* body() { return body_; }
457 bool is_positive() { return is_positive_; }
458 int capture_count() { return capture_count_; }
459 int capture_from() { return capture_from_; }
460 Type type() { return type_; }
461
Ben Murdoch097c5b22016-05-18 11:27:45 +0100462 class Builder {
463 public:
464 Builder(bool is_positive, RegExpNode* on_success,
465 int stack_pointer_register, int position_register,
466 int capture_register_count = 0, int capture_register_start = 0);
467 RegExpNode* on_match_success() { return on_match_success_; }
468 RegExpNode* ForMatch(RegExpNode* match);
469
470 private:
471 bool is_positive_;
472 RegExpNode* on_match_success_;
473 RegExpNode* on_success_;
474 int stack_pointer_register_;
475 int position_register_;
476 };
477
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000478 private:
479 RegExpTree* body_;
480 bool is_positive_;
481 int capture_count_;
482 int capture_from_;
483 Type type_;
484};
485
486
487class RegExpBackReference final : public RegExpTree {
488 public:
489 explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
490 void* Accept(RegExpVisitor* visitor, void* data) override;
491 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
492 RegExpBackReference* AsBackReference() override;
493 bool IsBackReference() override;
494 int min_match() override { return 0; }
495 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
496 // recursion, we give up. Ignorance is bliss.
497 int max_match() override { return kInfinity; }
498 int index() { return capture_->index(); }
499 RegExpCapture* capture() { return capture_; }
500
501 private:
502 RegExpCapture* capture_;
503};
504
505
506class RegExpEmpty final : public RegExpTree {
507 public:
508 RegExpEmpty() {}
509 void* Accept(RegExpVisitor* visitor, void* data) override;
510 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
511 RegExpEmpty* AsEmpty() override;
512 bool IsEmpty() override;
513 int min_match() override { return 0; }
514 int max_match() override { return 0; }
515};
516
517} // namespace internal
518} // namespace v8
519
520#endif // V8_REGEXP_REGEXP_AST_H_