blob: acf783cc417eae1aceb937b206c0900bb52c7241 [file] [log] [blame]
Ben Murdoch014dc512016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/regexp/regexp-ast.h"
10#include "src/zone.h"
11
12namespace v8 {
13namespace internal {
14
15struct RegExpCompileData;
16
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be NULL pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27 BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29 // Adds element at end of list. This element is buffered and can
30 // be read using last() or removed using RemoveLast until a new Add or until
31 // RemoveLast or GetList has been called.
32 void Add(T* value, Zone* zone) {
33 if (last_ != NULL) {
34 if (list_ == NULL) {
35 list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 }
37 list_->Add(last_, zone);
38 }
39 last_ = value;
40 }
41
42 T* last() {
43 DCHECK(last_ != NULL);
44 return last_;
45 }
46
47 T* RemoveLast() {
48 DCHECK(last_ != NULL);
49 T* result = last_;
50 if ((list_ != NULL) && (list_->length() > 0))
51 last_ = list_->RemoveLast();
52 else
53 last_ = NULL;
54 return result;
55 }
56
57 T* Get(int i) {
58 DCHECK((0 <= i) && (i < length()));
59 if (list_ == NULL) {
60 DCHECK_EQ(0, i);
61 return last_;
62 } else {
63 if (i == list_->length()) {
64 DCHECK(last_ != NULL);
65 return last_;
66 } else {
67 return list_->at(i);
68 }
69 }
70 }
71
72 void Clear() {
73 list_ = NULL;
74 last_ = NULL;
75 }
76
77 int length() {
78 int length = (list_ == NULL) ? 0 : list_->length();
79 return length + ((last_ == NULL) ? 0 : 1);
80 }
81
82 ZoneList<T*>* GetList(Zone* zone) {
83 if (list_ == NULL) {
84 list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 }
86 if (last_ != NULL) {
87 list_->Add(last_, zone);
88 last_ = NULL;
89 }
90 return list_;
91 }
92
93 private:
94 ZoneList<T*>* list_;
95 T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
Ben Murdoch109988c2016-05-18 11:27:45 +0100102 RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
Ben Murdoch014dc512016-03-22 12:00:34 +0000103 void AddCharacter(uc16 character);
104 void AddUnicodeCharacter(uc32 character);
Ben Murdoch109988c2016-05-18 11:27:45 +0100105 void AddEscapedUnicodeCharacter(uc32 character);
Ben Murdoch014dc512016-03-22 12:00:34 +0000106 // "Adds" an empty expression. Does nothing except consume a
107 // following quantifier
108 void AddEmpty();
Ben Murdoch109988c2016-05-18 11:27:45 +0100109 void AddCharacterClass(RegExpCharacterClass* cc);
110 void AddCharacterClassForDesugaring(uc32 c);
Ben Murdoch014dc512016-03-22 12:00:34 +0000111 void AddAtom(RegExpTree* tree);
Ben Murdoch109988c2016-05-18 11:27:45 +0100112 void AddTerm(RegExpTree* tree);
Ben Murdoch014dc512016-03-22 12:00:34 +0000113 void AddAssertion(RegExpTree* tree);
114 void NewAlternative(); // '|'
Ben Murdoch109988c2016-05-18 11:27:45 +0100115 bool AddQuantifierToAtom(int min, int max,
Ben Murdoch014dc512016-03-22 12:00:34 +0000116 RegExpQuantifier::QuantifierType type);
117 RegExpTree* ToRegExp();
118
119 private:
Ben Murdoch109988c2016-05-18 11:27:45 +0100120 static const uc16 kNoPendingSurrogate = 0;
121 void AddLeadSurrogate(uc16 lead_surrogate);
122 void AddTrailSurrogate(uc16 trail_surrogate);
123 void FlushPendingSurrogate();
Ben Murdoch014dc512016-03-22 12:00:34 +0000124 void FlushCharacters();
125 void FlushText();
126 void FlushTerms();
Ben Murdoch109988c2016-05-18 11:27:45 +0100127 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128 bool NeedsDesugaringForIgnoreCase(uc32 c);
Ben Murdoch014dc512016-03-22 12:00:34 +0000129 Zone* zone() const { return zone_; }
Ben Murdoch109988c2016-05-18 11:27:45 +0100130 bool ignore_case() const { return ignore_case_; }
131 bool unicode() const { return unicode_; }
Ben Murdoch014dc512016-03-22 12:00:34 +0000132
133 Zone* zone_;
134 bool pending_empty_;
Ben Murdoch109988c2016-05-18 11:27:45 +0100135 bool ignore_case_;
136 bool unicode_;
Ben Murdoch014dc512016-03-22 12:00:34 +0000137 ZoneList<uc16>* characters_;
Ben Murdoch109988c2016-05-18 11:27:45 +0100138 uc16 pending_surrogate_;
Ben Murdoch014dc512016-03-22 12:00:34 +0000139 BufferedZoneList<RegExpTree, 2> terms_;
140 BufferedZoneList<RegExpTree, 2> text_;
141 BufferedZoneList<RegExpTree, 2> alternatives_;
142#ifdef DEBUG
143 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144#define LAST(x) last_added_ = x;
145#else
146#define LAST(x)
147#endif
148};
149
150
151class RegExpParser BASE_EMBEDDED {
152 public:
Ben Murdoch109988c2016-05-18 11:27:45 +0100153 RegExpParser(FlatStringReader* in, Handle<String>* error,
154 JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
Ben Murdoch014dc512016-03-22 12:00:34 +0000155
156 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
Ben Murdoch109988c2016-05-18 11:27:45 +0100157 JSRegExp::Flags flags, RegExpCompileData* result);
Ben Murdoch014dc512016-03-22 12:00:34 +0000158
159 RegExpTree* ParsePattern();
160 RegExpTree* ParseDisjunction();
161 RegExpTree* ParseGroup();
162 RegExpTree* ParseCharacterClass();
163
164 // Parses a {...,...} quantifier and stores the range in the given
165 // out parameters.
166 bool ParseIntervalQuantifier(int* min_out, int* max_out);
167
168 // Parses and returns a single escaped character. The character
169 // must not be 'b' or 'B' since they are usually handle specially.
170 uc32 ParseClassCharacterEscape();
171
172 // Checks whether the following is a length-digit hexadecimal number,
173 // and sets the value if it is.
174 bool ParseHexEscape(int length, uc32* value);
175 bool ParseUnicodeEscape(uc32* value);
176 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
Ben Murdoch109988c2016-05-18 11:27:45 +0100177 ZoneList<CharacterRange>* ParsePropertyClass();
Ben Murdoch014dc512016-03-22 12:00:34 +0000178
179 uc32 ParseOctalLiteral();
180
181 // Tries to parse the input as a back reference. If successful it
182 // stores the result in the output parameter and returns true. If
183 // it fails it will push back the characters read so the same characters
184 // can be reparsed.
185 bool ParseBackReferenceIndex(int* index_out);
186
187 CharacterRange ParseClassAtom(uc16* char_class);
188 RegExpTree* ReportError(Vector<const char> message);
189 void Advance();
190 void Advance(int dist);
191 void Reset(int pos);
192
193 // Reports whether the pattern might be used as a literal search string.
194 // Only use if the result of the parse is a single atom node.
195 bool simple();
196 bool contains_anchor() { return contains_anchor_; }
197 void set_contains_anchor() { contains_anchor_ = true; }
198 int captures_started() { return captures_started_; }
199 int position() { return next_pos_ - 1; }
200 bool failed() { return failed_; }
Ben Murdoch109988c2016-05-18 11:27:45 +0100201 bool ignore_case() const { return ignore_case_; }
202 bool multiline() const { return multiline_; }
203 bool unicode() const { return unicode_; }
Ben Murdoch014dc512016-03-22 12:00:34 +0000204
Ben Murdoch109988c2016-05-18 11:27:45 +0100205 static bool IsSyntaxCharacterOrSlash(uc32 c);
Ben Murdoch014dc512016-03-22 12:00:34 +0000206
207 static const int kMaxCaptures = 1 << 16;
208 static const uc32 kEndMarker = (1 << 21);
209
210 private:
211 enum SubexpressionType {
212 INITIAL,
213 CAPTURE, // All positive values represent captures.
214 POSITIVE_LOOKAROUND,
215 NEGATIVE_LOOKAROUND,
216 GROUPING
217 };
218
219 class RegExpParserState : public ZoneObject {
220 public:
221 RegExpParserState(RegExpParserState* previous_state,
222 SubexpressionType group_type,
223 RegExpLookaround::Type lookaround_type,
Ben Murdoch109988c2016-05-18 11:27:45 +0100224 int disjunction_capture_index, bool ignore_case,
225 bool unicode, Zone* zone)
Ben Murdoch014dc512016-03-22 12:00:34 +0000226 : previous_state_(previous_state),
Ben Murdoch109988c2016-05-18 11:27:45 +0100227 builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
Ben Murdoch014dc512016-03-22 12:00:34 +0000228 group_type_(group_type),
229 lookaround_type_(lookaround_type),
230 disjunction_capture_index_(disjunction_capture_index) {}
231 // Parser state of containing expression, if any.
232 RegExpParserState* previous_state() { return previous_state_; }
233 bool IsSubexpression() { return previous_state_ != NULL; }
234 // RegExpBuilder building this regexp's AST.
235 RegExpBuilder* builder() { return builder_; }
236 // Type of regexp being parsed (parenthesized group or entire regexp).
237 SubexpressionType group_type() { return group_type_; }
238 // Lookahead or Lookbehind.
239 RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
240 // Index in captures array of first capture in this sub-expression, if any.
241 // Also the capture index of this sub-expression itself, if group_type
242 // is CAPTURE.
243 int capture_index() { return disjunction_capture_index_; }
244
245 // Check whether the parser is inside a capture group with the given index.
246 bool IsInsideCaptureGroup(int index);
247
248 private:
249 // Linked list implementation of stack of states.
250 RegExpParserState* previous_state_;
251 // Builder for the stored disjunction.
252 RegExpBuilder* builder_;
253 // Stored disjunction type (capture, look-ahead or grouping), if any.
254 SubexpressionType group_type_;
255 // Stored read direction.
256 RegExpLookaround::Type lookaround_type_;
257 // Stored disjunction's capture index (if any).
258 int disjunction_capture_index_;
259 };
260
261 // Return the 1-indexed RegExpCapture object, allocate if necessary.
262 RegExpCapture* GetCapture(int index);
263
264 Isolate* isolate() { return isolate_; }
265 Zone* zone() const { return zone_; }
266
267 uc32 current() { return current_; }
268 bool has_more() { return has_more_; }
269 bool has_next() { return next_pos_ < in()->length(); }
270 uc32 Next();
Ben Murdoch109988c2016-05-18 11:27:45 +0100271 template <bool update_position>
272 uc32 ReadNext();
Ben Murdoch014dc512016-03-22 12:00:34 +0000273 FlatStringReader* in() { return in_; }
274 void ScanForCaptures();
275
276 Isolate* isolate_;
277 Zone* zone_;
278 Handle<String>* error_;
279 ZoneList<RegExpCapture*>* captures_;
280 FlatStringReader* in_;
281 uc32 current_;
Ben Murdoch109988c2016-05-18 11:27:45 +0100282 bool ignore_case_;
283 bool multiline_;
284 bool unicode_;
Ben Murdoch014dc512016-03-22 12:00:34 +0000285 int next_pos_;
286 int captures_started_;
287 // The capture count is only valid after we have scanned for captures.
288 int capture_count_;
289 bool has_more_;
Ben Murdoch014dc512016-03-22 12:00:34 +0000290 bool simple_;
291 bool contains_anchor_;
292 bool is_scanned_for_captures_;
293 bool failed_;
294};
295
296} // namespace internal
297} // namespace v8
298
299#endif // V8_REGEXP_REGEXP_PARSER_H_