blob: af9b765fba16b0de0f00d814da565a8ebd993ec1 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/regexp/regexp-ast.h"
10#include "src/zone.h"
11
12namespace v8 {
13namespace internal {
14
15struct RegExpCompileData;
16
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be NULL pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27 BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29 // Adds element at end of list. This element is buffered and can
30 // be read using last() or removed using RemoveLast until a new Add or until
31 // RemoveLast or GetList has been called.
32 void Add(T* value, Zone* zone) {
33 if (last_ != NULL) {
34 if (list_ == NULL) {
35 list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 }
37 list_->Add(last_, zone);
38 }
39 last_ = value;
40 }
41
42 T* last() {
43 DCHECK(last_ != NULL);
44 return last_;
45 }
46
47 T* RemoveLast() {
48 DCHECK(last_ != NULL);
49 T* result = last_;
50 if ((list_ != NULL) && (list_->length() > 0))
51 last_ = list_->RemoveLast();
52 else
53 last_ = NULL;
54 return result;
55 }
56
57 T* Get(int i) {
58 DCHECK((0 <= i) && (i < length()));
59 if (list_ == NULL) {
60 DCHECK_EQ(0, i);
61 return last_;
62 } else {
63 if (i == list_->length()) {
64 DCHECK(last_ != NULL);
65 return last_;
66 } else {
67 return list_->at(i);
68 }
69 }
70 }
71
72 void Clear() {
73 list_ = NULL;
74 last_ = NULL;
75 }
76
77 int length() {
78 int length = (list_ == NULL) ? 0 : list_->length();
79 return length + ((last_ == NULL) ? 0 : 1);
80 }
81
82 ZoneList<T*>* GetList(Zone* zone) {
83 if (list_ == NULL) {
84 list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 }
86 if (last_ != NULL) {
87 list_->Add(last_, zone);
88 last_ = NULL;
89 }
90 return list_;
91 }
92
93 private:
94 ZoneList<T*>* list_;
95 T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
102 explicit RegExpBuilder(Zone* zone);
103 void AddCharacter(uc16 character);
104 void AddUnicodeCharacter(uc32 character);
105 // "Adds" an empty expression. Does nothing except consume a
106 // following quantifier
107 void AddEmpty();
108 void AddAtom(RegExpTree* tree);
109 void AddAssertion(RegExpTree* tree);
110 void NewAlternative(); // '|'
111 void AddQuantifierToAtom(int min, int max,
112 RegExpQuantifier::QuantifierType type);
113 RegExpTree* ToRegExp();
114
115 private:
116 void FlushCharacters();
117 void FlushText();
118 void FlushTerms();
119 Zone* zone() const { return zone_; }
120
121 Zone* zone_;
122 bool pending_empty_;
123 ZoneList<uc16>* characters_;
124 BufferedZoneList<RegExpTree, 2> terms_;
125 BufferedZoneList<RegExpTree, 2> text_;
126 BufferedZoneList<RegExpTree, 2> alternatives_;
127#ifdef DEBUG
128 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
129#define LAST(x) last_added_ = x;
130#else
131#define LAST(x)
132#endif
133};
134
135
136class RegExpParser BASE_EMBEDDED {
137 public:
138 RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
139 bool unicode, Isolate* isolate, Zone* zone);
140
141 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
142 bool multiline, bool unicode,
143 RegExpCompileData* result);
144
145 RegExpTree* ParsePattern();
146 RegExpTree* ParseDisjunction();
147 RegExpTree* ParseGroup();
148 RegExpTree* ParseCharacterClass();
149
150 // Parses a {...,...} quantifier and stores the range in the given
151 // out parameters.
152 bool ParseIntervalQuantifier(int* min_out, int* max_out);
153
154 // Parses and returns a single escaped character. The character
155 // must not be 'b' or 'B' since they are usually handle specially.
156 uc32 ParseClassCharacterEscape();
157
158 // Checks whether the following is a length-digit hexadecimal number,
159 // and sets the value if it is.
160 bool ParseHexEscape(int length, uc32* value);
161 bool ParseUnicodeEscape(uc32* value);
162 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
163
164 uc32 ParseOctalLiteral();
165
166 // Tries to parse the input as a back reference. If successful it
167 // stores the result in the output parameter and returns true. If
168 // it fails it will push back the characters read so the same characters
169 // can be reparsed.
170 bool ParseBackReferenceIndex(int* index_out);
171
172 CharacterRange ParseClassAtom(uc16* char_class);
173 RegExpTree* ReportError(Vector<const char> message);
174 void Advance();
175 void Advance(int dist);
176 void Reset(int pos);
177
178 // Reports whether the pattern might be used as a literal search string.
179 // Only use if the result of the parse is a single atom node.
180 bool simple();
181 bool contains_anchor() { return contains_anchor_; }
182 void set_contains_anchor() { contains_anchor_ = true; }
183 int captures_started() { return captures_started_; }
184 int position() { return next_pos_ - 1; }
185 bool failed() { return failed_; }
186
187 static bool IsSyntaxCharacter(uc32 c);
188
189 static const int kMaxCaptures = 1 << 16;
190 static const uc32 kEndMarker = (1 << 21);
191
192 private:
193 enum SubexpressionType {
194 INITIAL,
195 CAPTURE, // All positive values represent captures.
196 POSITIVE_LOOKAROUND,
197 NEGATIVE_LOOKAROUND,
198 GROUPING
199 };
200
201 class RegExpParserState : public ZoneObject {
202 public:
203 RegExpParserState(RegExpParserState* previous_state,
204 SubexpressionType group_type,
205 RegExpLookaround::Type lookaround_type,
206 int disjunction_capture_index, Zone* zone)
207 : previous_state_(previous_state),
208 builder_(new (zone) RegExpBuilder(zone)),
209 group_type_(group_type),
210 lookaround_type_(lookaround_type),
211 disjunction_capture_index_(disjunction_capture_index) {}
212 // Parser state of containing expression, if any.
213 RegExpParserState* previous_state() { return previous_state_; }
214 bool IsSubexpression() { return previous_state_ != NULL; }
215 // RegExpBuilder building this regexp's AST.
216 RegExpBuilder* builder() { return builder_; }
217 // Type of regexp being parsed (parenthesized group or entire regexp).
218 SubexpressionType group_type() { return group_type_; }
219 // Lookahead or Lookbehind.
220 RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
221 // Index in captures array of first capture in this sub-expression, if any.
222 // Also the capture index of this sub-expression itself, if group_type
223 // is CAPTURE.
224 int capture_index() { return disjunction_capture_index_; }
225
226 // Check whether the parser is inside a capture group with the given index.
227 bool IsInsideCaptureGroup(int index);
228
229 private:
230 // Linked list implementation of stack of states.
231 RegExpParserState* previous_state_;
232 // Builder for the stored disjunction.
233 RegExpBuilder* builder_;
234 // Stored disjunction type (capture, look-ahead or grouping), if any.
235 SubexpressionType group_type_;
236 // Stored read direction.
237 RegExpLookaround::Type lookaround_type_;
238 // Stored disjunction's capture index (if any).
239 int disjunction_capture_index_;
240 };
241
242 // Return the 1-indexed RegExpCapture object, allocate if necessary.
243 RegExpCapture* GetCapture(int index);
244
245 Isolate* isolate() { return isolate_; }
246 Zone* zone() const { return zone_; }
247
248 uc32 current() { return current_; }
249 bool has_more() { return has_more_; }
250 bool has_next() { return next_pos_ < in()->length(); }
251 uc32 Next();
252 FlatStringReader* in() { return in_; }
253 void ScanForCaptures();
254
255 Isolate* isolate_;
256 Zone* zone_;
257 Handle<String>* error_;
258 ZoneList<RegExpCapture*>* captures_;
259 FlatStringReader* in_;
260 uc32 current_;
261 int next_pos_;
262 int captures_started_;
263 // The capture count is only valid after we have scanned for captures.
264 int capture_count_;
265 bool has_more_;
266 bool multiline_;
267 bool unicode_;
268 bool simple_;
269 bool contains_anchor_;
270 bool is_scanned_for_captures_;
271 bool failed_;
272};
273
274} // namespace internal
275} // namespace v8
276
277#endif // V8_REGEXP_REGEXP_PARSER_H_