blob: a0b975d79ef457b9c860e4794db367bddb490ea0 [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/regexp/regexp-ast.h"
10#include "src/zone.h"
11
12namespace v8 {
13namespace internal {
14
15struct RegExpCompileData;
16
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be NULL pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27 BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29 // Adds element at end of list. This element is buffered and can
30 // be read using last() or removed using RemoveLast until a new Add or until
31 // RemoveLast or GetList has been called.
32 void Add(T* value, Zone* zone) {
33 if (last_ != NULL) {
34 if (list_ == NULL) {
35 list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 }
37 list_->Add(last_, zone);
38 }
39 last_ = value;
40 }
41
42 T* last() {
43 DCHECK(last_ != NULL);
44 return last_;
45 }
46
47 T* RemoveLast() {
48 DCHECK(last_ != NULL);
49 T* result = last_;
50 if ((list_ != NULL) && (list_->length() > 0))
51 last_ = list_->RemoveLast();
52 else
53 last_ = NULL;
54 return result;
55 }
56
57 T* Get(int i) {
58 DCHECK((0 <= i) && (i < length()));
59 if (list_ == NULL) {
60 DCHECK_EQ(0, i);
61 return last_;
62 } else {
63 if (i == list_->length()) {
64 DCHECK(last_ != NULL);
65 return last_;
66 } else {
67 return list_->at(i);
68 }
69 }
70 }
71
72 void Clear() {
73 list_ = NULL;
74 last_ = NULL;
75 }
76
77 int length() {
78 int length = (list_ == NULL) ? 0 : list_->length();
79 return length + ((last_ == NULL) ? 0 : 1);
80 }
81
82 ZoneList<T*>* GetList(Zone* zone) {
83 if (list_ == NULL) {
84 list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 }
86 if (last_ != NULL) {
87 list_->Add(last_, zone);
88 last_ = NULL;
89 }
90 return list_;
91 }
92
93 private:
94 ZoneList<T*>* list_;
95 T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100102 RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000103 void AddCharacter(uc16 character);
104 void AddUnicodeCharacter(uc32 character);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100105 void AddEscapedUnicodeCharacter(uc32 character);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000106 // "Adds" an empty expression. Does nothing except consume a
107 // following quantifier
108 void AddEmpty();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100109 void AddCharacterClass(RegExpCharacterClass* cc);
110 void AddCharacterClassForDesugaring(uc32 c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000111 void AddAtom(RegExpTree* tree);
Ben Murdoch097c5b22016-05-18 11:27:45 +0100112 void AddTerm(RegExpTree* tree);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000113 void AddAssertion(RegExpTree* tree);
114 void NewAlternative(); // '|'
Ben Murdoch097c5b22016-05-18 11:27:45 +0100115 bool AddQuantifierToAtom(int min, int max,
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000116 RegExpQuantifier::QuantifierType type);
117 RegExpTree* ToRegExp();
118
119 private:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100120 static const uc16 kNoPendingSurrogate = 0;
121 void AddLeadSurrogate(uc16 lead_surrogate);
122 void AddTrailSurrogate(uc16 trail_surrogate);
123 void FlushPendingSurrogate();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000124 void FlushCharacters();
125 void FlushText();
126 void FlushTerms();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100127 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128 bool NeedsDesugaringForIgnoreCase(uc32 c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000129 Zone* zone() const { return zone_; }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100130 bool ignore_case() const { return ignore_case_; }
131 bool unicode() const { return unicode_; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000132
133 Zone* zone_;
134 bool pending_empty_;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100135 bool ignore_case_;
136 bool unicode_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000137 ZoneList<uc16>* characters_;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100138 uc16 pending_surrogate_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000139 BufferedZoneList<RegExpTree, 2> terms_;
140 BufferedZoneList<RegExpTree, 2> text_;
141 BufferedZoneList<RegExpTree, 2> alternatives_;
142#ifdef DEBUG
143 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144#define LAST(x) last_added_ = x;
145#else
146#define LAST(x)
147#endif
148};
149
150
151class RegExpParser BASE_EMBEDDED {
152 public:
Ben Murdoch097c5b22016-05-18 11:27:45 +0100153 RegExpParser(FlatStringReader* in, Handle<String>* error,
154 JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000155
156 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100157 JSRegExp::Flags flags, RegExpCompileData* result);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000158
159 RegExpTree* ParsePattern();
160 RegExpTree* ParseDisjunction();
161 RegExpTree* ParseGroup();
162 RegExpTree* ParseCharacterClass();
163
164 // Parses a {...,...} quantifier and stores the range in the given
165 // out parameters.
166 bool ParseIntervalQuantifier(int* min_out, int* max_out);
167
168 // Parses and returns a single escaped character. The character
169 // must not be 'b' or 'B' since they are usually handle specially.
170 uc32 ParseClassCharacterEscape();
171
172 // Checks whether the following is a length-digit hexadecimal number,
173 // and sets the value if it is.
174 bool ParseHexEscape(int length, uc32* value);
175 bool ParseUnicodeEscape(uc32* value);
176 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
Ben Murdoch61f157c2016-09-16 13:49:30 +0100177 bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000178
179 uc32 ParseOctalLiteral();
180
181 // Tries to parse the input as a back reference. If successful it
182 // stores the result in the output parameter and returns true. If
183 // it fails it will push back the characters read so the same characters
184 // can be reparsed.
185 bool ParseBackReferenceIndex(int* index_out);
186
Ben Murdochda12d292016-06-02 14:46:10 +0100187 bool ParseClassProperty(ZoneList<CharacterRange>* result);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000188 CharacterRange ParseClassAtom(uc16* char_class);
189 RegExpTree* ReportError(Vector<const char> message);
190 void Advance();
191 void Advance(int dist);
192 void Reset(int pos);
193
194 // Reports whether the pattern might be used as a literal search string.
195 // Only use if the result of the parse is a single atom node.
196 bool simple();
197 bool contains_anchor() { return contains_anchor_; }
198 void set_contains_anchor() { contains_anchor_ = true; }
199 int captures_started() { return captures_started_; }
200 int position() { return next_pos_ - 1; }
201 bool failed() { return failed_; }
Ben Murdoch097c5b22016-05-18 11:27:45 +0100202 bool ignore_case() const { return ignore_case_; }
203 bool multiline() const { return multiline_; }
204 bool unicode() const { return unicode_; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000205
Ben Murdoch097c5b22016-05-18 11:27:45 +0100206 static bool IsSyntaxCharacterOrSlash(uc32 c);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000207
208 static const int kMaxCaptures = 1 << 16;
209 static const uc32 kEndMarker = (1 << 21);
210
211 private:
212 enum SubexpressionType {
213 INITIAL,
214 CAPTURE, // All positive values represent captures.
215 POSITIVE_LOOKAROUND,
216 NEGATIVE_LOOKAROUND,
217 GROUPING
218 };
219
220 class RegExpParserState : public ZoneObject {
221 public:
222 RegExpParserState(RegExpParserState* previous_state,
223 SubexpressionType group_type,
224 RegExpLookaround::Type lookaround_type,
Ben Murdoch61f157c2016-09-16 13:49:30 +0100225 int disjunction_capture_index,
226 const ZoneVector<uc16>* capture_name, bool ignore_case,
Ben Murdoch097c5b22016-05-18 11:27:45 +0100227 bool unicode, Zone* zone)
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000228 : previous_state_(previous_state),
Ben Murdoch097c5b22016-05-18 11:27:45 +0100229 builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000230 group_type_(group_type),
231 lookaround_type_(lookaround_type),
Ben Murdoch61f157c2016-09-16 13:49:30 +0100232 disjunction_capture_index_(disjunction_capture_index),
233 capture_name_(capture_name) {}
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000234 // Parser state of containing expression, if any.
235 RegExpParserState* previous_state() { return previous_state_; }
236 bool IsSubexpression() { return previous_state_ != NULL; }
237 // RegExpBuilder building this regexp's AST.
238 RegExpBuilder* builder() { return builder_; }
239 // Type of regexp being parsed (parenthesized group or entire regexp).
240 SubexpressionType group_type() { return group_type_; }
241 // Lookahead or Lookbehind.
242 RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
243 // Index in captures array of first capture in this sub-expression, if any.
244 // Also the capture index of this sub-expression itself, if group_type
245 // is CAPTURE.
246 int capture_index() { return disjunction_capture_index_; }
Ben Murdoch61f157c2016-09-16 13:49:30 +0100247 // The name of the current sub-expression, if group_type is CAPTURE. Only
248 // used for named captures.
249 const ZoneVector<uc16>* capture_name() { return capture_name_; }
250
251 bool IsNamedCapture() const { return capture_name_ != nullptr; }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000252
253 // Check whether the parser is inside a capture group with the given index.
254 bool IsInsideCaptureGroup(int index);
Ben Murdoch61f157c2016-09-16 13:49:30 +0100255 // Check whether the parser is inside a capture group with the given name.
256 bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000257
258 private:
259 // Linked list implementation of stack of states.
260 RegExpParserState* previous_state_;
261 // Builder for the stored disjunction.
262 RegExpBuilder* builder_;
263 // Stored disjunction type (capture, look-ahead or grouping), if any.
264 SubexpressionType group_type_;
265 // Stored read direction.
266 RegExpLookaround::Type lookaround_type_;
267 // Stored disjunction's capture index (if any).
268 int disjunction_capture_index_;
Ben Murdoch61f157c2016-09-16 13:49:30 +0100269 // Stored capture name (if any).
270 const ZoneVector<uc16>* capture_name_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000271 };
272
273 // Return the 1-indexed RegExpCapture object, allocate if necessary.
274 RegExpCapture* GetCapture(int index);
275
Ben Murdoch61f157c2016-09-16 13:49:30 +0100276 // Creates a new named capture at the specified index. Must be called exactly
277 // once for each named capture. Fails if a capture with the same name is
278 // encountered.
279 bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
280
281 // Parses the name of a capture group (?<name>pattern). The name must adhere
282 // to IdentifierName in the ECMAScript standard.
283 const ZoneVector<uc16>* ParseCaptureGroupName();
284
285 bool ParseNamedBackReference(RegExpBuilder* builder,
286 RegExpParserState* state);
287
288 // After the initial parsing pass, patch corresponding RegExpCapture objects
289 // into all RegExpBackReferences. This is done after initial parsing in order
290 // to avoid complicating cases in which references comes before the capture.
291 void PatchNamedBackReferences();
292
293 Handle<FixedArray> CreateCaptureNameMap();
294
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000295 Isolate* isolate() { return isolate_; }
296 Zone* zone() const { return zone_; }
297
298 uc32 current() { return current_; }
299 bool has_more() { return has_more_; }
300 bool has_next() { return next_pos_ < in()->length(); }
301 uc32 Next();
Ben Murdoch097c5b22016-05-18 11:27:45 +0100302 template <bool update_position>
303 uc32 ReadNext();
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000304 FlatStringReader* in() { return in_; }
305 void ScanForCaptures();
306
307 Isolate* isolate_;
308 Zone* zone_;
309 Handle<String>* error_;
310 ZoneList<RegExpCapture*>* captures_;
Ben Murdoch61f157c2016-09-16 13:49:30 +0100311 ZoneList<RegExpCapture*>* named_captures_;
312 ZoneList<RegExpBackReference*>* named_back_references_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000313 FlatStringReader* in_;
314 uc32 current_;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100315 bool ignore_case_;
316 bool multiline_;
317 bool unicode_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000318 int next_pos_;
319 int captures_started_;
320 // The capture count is only valid after we have scanned for captures.
321 int capture_count_;
322 bool has_more_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000323 bool simple_;
324 bool contains_anchor_;
325 bool is_scanned_for_captures_;
326 bool failed_;
327};
328
329} // namespace internal
330} // namespace v8
331
332#endif // V8_REGEXP_REGEXP_PARSER_H_