blob: 8d6184697fa58aed313d22aad77dacf1efb6667a [file] [log] [blame]
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40 UTF8Buffer();
41 ~UTF8Buffer();
42
Kristian Monsen80d68ea2010-09-08 11:05:35 +010043 inline void AddChar(uc32 c) {
44 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
45 buffer_.Add(static_cast<char>(c));
Steve Blocka7e24c12009-10-30 11:49:00 +000046 } else {
47 AddCharSlow(c);
48 }
49 }
50
Kristian Monsen80d68ea2010-09-08 11:05:35 +010051 void StartLiteral() {
52 buffer_.StartSequence();
53 }
54
55 Vector<const char> EndLiteral() {
56 buffer_.Add(kEndMarker);
57 Vector<char> sequence = buffer_.EndSequence();
58 return Vector<const char>(sequence.start(), sequence.length());
59 }
60
61 void DropLiteral() {
62 buffer_.DropSequence();
63 }
64
Steve Blockd0582a62009-12-15 09:54:21 +000065 void Reset() {
Kristian Monsen80d68ea2010-09-08 11:05:35 +010066 buffer_.Reset();
Steve Blockd0582a62009-12-15 09:54:21 +000067 }
68
Kristian Monsen80d68ea2010-09-08 11:05:35 +010069 // The end marker added after a parsed literal.
70 // Using zero allows the usage of strlen and similar functions on
71 // identifiers and numbers (but not strings, since they may contain zero
72 // bytes).
73 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
74 // an utf-8 string. This requires changes in all places that uses
75 // str-functions on the literals, but allows a single pointer to represent
76 // the literal, even if it contains embedded zeros.
77 static const char kEndMarker = '\x00';
Steve Blocka7e24c12009-10-30 11:49:00 +000078 private:
Steve Blockd0582a62009-12-15 09:54:21 +000079 static const int kInitialCapacity = 256;
Kristian Monsen80d68ea2010-09-08 11:05:35 +010080 SequenceCollector<char, 4> buffer_;
Steve Blocka7e24c12009-10-30 11:49:00 +000081
82 void AddCharSlow(uc32 c);
83};
84
85
Steve Block6ded16b2010-05-10 14:33:55 +010086// Interface through which the scanner reads characters from the input source.
Steve Blocka7e24c12009-10-30 11:49:00 +000087class UTF16Buffer {
88 public:
89 UTF16Buffer();
90 virtual ~UTF16Buffer() {}
91
92 virtual void PushBack(uc32 ch) = 0;
Steve Block6ded16b2010-05-10 14:33:55 +010093 // Returns a value < 0 when the buffer end is reached.
Steve Blocka7e24c12009-10-30 11:49:00 +000094 virtual uc32 Advance() = 0;
95 virtual void SeekForward(int pos) = 0;
96
97 int pos() const { return pos_; }
Steve Blocka7e24c12009-10-30 11:49:00 +000098
99 protected:
Steve Block6ded16b2010-05-10 14:33:55 +0100100 int pos_; // Current position in the buffer.
101 int end_; // Position where scanning should stop (EOF).
Steve Blocka7e24c12009-10-30 11:49:00 +0000102};
103
104
Steve Block6ded16b2010-05-10 14:33:55 +0100105// UTF16 buffer to read characters from a character stream.
Steve Blocka7e24c12009-10-30 11:49:00 +0000106class CharacterStreamUTF16Buffer: public UTF16Buffer {
107 public:
108 CharacterStreamUTF16Buffer();
109 virtual ~CharacterStreamUTF16Buffer() {}
Steve Block6ded16b2010-05-10 14:33:55 +0100110 void Initialize(Handle<String> data,
111 unibrow::CharacterStream* stream,
112 int start_position,
113 int end_position);
Steve Blocka7e24c12009-10-30 11:49:00 +0000114 virtual void PushBack(uc32 ch);
115 virtual uc32 Advance();
116 virtual void SeekForward(int pos);
117
118 private:
119 List<uc32> pushback_buffer_;
120 uc32 last_;
121 unibrow::CharacterStream* stream_;
122
123 List<uc32>* pushback_buffer() { return &pushback_buffer_; }
124};
125
126
Steve Block6ded16b2010-05-10 14:33:55 +0100127// UTF16 buffer to read characters from an external string.
128template <typename StringType, typename CharType>
129class ExternalStringUTF16Buffer: public UTF16Buffer {
Steve Blocka7e24c12009-10-30 11:49:00 +0000130 public:
Steve Block6ded16b2010-05-10 14:33:55 +0100131 ExternalStringUTF16Buffer();
132 virtual ~ExternalStringUTF16Buffer() {}
133 void Initialize(Handle<StringType> data,
134 int start_position,
135 int end_position);
Steve Blocka7e24c12009-10-30 11:49:00 +0000136 virtual void PushBack(uc32 ch);
137 virtual uc32 Advance();
138 virtual void SeekForward(int pos);
139
140 private:
Steve Block6ded16b2010-05-10 14:33:55 +0100141 const CharType* raw_data_; // Pointer to the actual array of characters.
Steve Blocka7e24c12009-10-30 11:49:00 +0000142};
143
144
Steve Blockd0582a62009-12-15 09:54:21 +0000145class KeywordMatcher {
146// Incrementally recognize keywords.
147//
148// Recognized keywords:
149// break case catch const* continue debugger* default delete do else
150// finally false for function if in instanceof native* new null
151// return switch this throw true try typeof var void while with
152//
153// *: Actually "future reserved keywords". These are the only ones we
154// recognized, the remaining are allowed as identifiers.
155 public:
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100156 KeywordMatcher()
157 : state_(INITIAL),
158 token_(Token::IDENTIFIER),
159 keyword_(NULL),
160 counter_(0),
161 keyword_token_(Token::ILLEGAL) {}
Steve Blockd0582a62009-12-15 09:54:21 +0000162
163 Token::Value token() { return token_; }
164
165 inline void AddChar(uc32 input) {
166 if (state_ != UNMATCHABLE) {
167 Step(input);
168 }
169 }
170
171 void Fail() {
172 token_ = Token::IDENTIFIER;
173 state_ = UNMATCHABLE;
174 }
175
176 private:
177 enum State {
178 UNMATCHABLE,
179 INITIAL,
180 KEYWORD_PREFIX,
181 KEYWORD_MATCHED,
182 C,
183 CA,
184 CO,
185 CON,
186 D,
187 DE,
188 F,
189 I,
190 IN,
191 N,
192 T,
193 TH,
194 TR,
195 V,
196 W
197 };
198
199 struct FirstState {
200 const char* keyword;
201 State state;
202 Token::Value token;
203 };
204
205 // Range of possible first characters of a keyword.
206 static const unsigned int kFirstCharRangeMin = 'b';
207 static const unsigned int kFirstCharRangeMax = 'w';
208 static const unsigned int kFirstCharRangeLength =
209 kFirstCharRangeMax - kFirstCharRangeMin + 1;
210 // State map for first keyword character range.
211 static FirstState first_states_[kFirstCharRangeLength];
212
Steve Blockd0582a62009-12-15 09:54:21 +0000213 // If input equals keyword's character at position, continue matching keyword
214 // from that position.
215 inline bool MatchKeywordStart(uc32 input,
216 const char* keyword,
217 int position,
218 Token::Value token_if_match) {
219 if (input == keyword[position]) {
220 state_ = KEYWORD_PREFIX;
221 this->keyword_ = keyword;
222 this->counter_ = position + 1;
223 this->keyword_token_ = token_if_match;
224 return true;
225 }
226 return false;
227 }
228
229 // If input equals match character, transition to new state and return true.
230 inline bool MatchState(uc32 input, char match, State new_state) {
231 if (input == match) {
232 state_ = new_state;
233 return true;
234 }
235 return false;
236 }
237
238 inline bool MatchKeyword(uc32 input,
239 char match,
240 State new_state,
241 Token::Value keyword_token) {
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100242 if (input != match) {
243 return false;
Steve Blockd0582a62009-12-15 09:54:21 +0000244 }
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100245 state_ = new_state;
246 token_ = keyword_token;
247 return true;
Steve Blockd0582a62009-12-15 09:54:21 +0000248 }
249
250 void Step(uc32 input);
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100251
252 // Current state.
253 State state_;
254 // Token for currently added characters.
255 Token::Value token_;
256
257 // Matching a specific keyword string (there is only one possible valid
258 // keyword with the current prefix).
259 const char* keyword_;
260 int counter_;
261 Token::Value keyword_token_;
Steve Blockd0582a62009-12-15 09:54:21 +0000262};
263
264
// Mode the scanner is constructed with: a full parse or a pre-parse.
enum ParserMode { PARSE, PREPARSE };
// Language of the source the scanner is initialized with.
enum ParserLanguage { JAVASCRIPT, JSON };


Steve Blocka7e24c12009-10-30 11:49:00 +0000269class Scanner {
270 public:
Steve Blocka7e24c12009-10-30 11:49:00 +0000271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
272
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100273 class LiteralScope {
274 public:
275 explicit LiteralScope(Scanner* self);
276 ~LiteralScope();
277 void Complete();
278
279 private:
280 Scanner* scanner_;
281 bool complete_;
282 };
283
Steve Blocka7e24c12009-10-30 11:49:00 +0000284 // Construction
Leon Clarke4515c472010-02-03 11:58:03 +0000285 explicit Scanner(ParserMode parse_mode);
Steve Blocka7e24c12009-10-30 11:49:00 +0000286
Steve Block6ded16b2010-05-10 14:33:55 +0100287 // Initialize the Scanner to scan source.
288 void Initialize(Handle<String> source,
289 ParserLanguage language);
290 void Initialize(Handle<String> source,
291 unibrow::CharacterStream* stream,
292 ParserLanguage language);
293 void Initialize(Handle<String> source,
294 int start_position, int end_position,
295 ParserLanguage language);
Steve Blocka7e24c12009-10-30 11:49:00 +0000296
297 // Returns the next token.
298 Token::Value Next();
299
300 // One token look-ahead (past the token returned by Next()).
301 Token::Value peek() const { return next_.token; }
302
303 // Returns true if there was a line terminator before the peek'ed token.
304 bool has_line_terminator_before_next() const {
305 return has_line_terminator_before_next_;
306 }
307
308 struct Location {
309 Location(int b, int e) : beg_pos(b), end_pos(e) { }
310 Location() : beg_pos(0), end_pos(0) { }
311 int beg_pos;
312 int end_pos;
313 };
314
315 // Returns the location information for the current token
316 // (the token returned by Next()).
317 Location location() const { return current_.location; }
318 Location peek_location() const { return next_.location; }
319
320 // Returns the literal string, if any, for the current token (the
321 // token returned by Next()). The string is 0-terminated and in
322 // UTF-8 format; they may contain 0-characters. Literal strings are
323 // collected for identifiers, strings, and numbers.
Steve Blockd0582a62009-12-15 09:54:21 +0000324 // These functions only give the correct result if the literal
325 // was scanned between calls to StartLiteral() and TerminateLiteral().
Steve Blocka7e24c12009-10-30 11:49:00 +0000326 const char* literal_string() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100327 return current_.literal_chars.start();
Steve Blocka7e24c12009-10-30 11:49:00 +0000328 }
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100329
Steve Blocka7e24c12009-10-30 11:49:00 +0000330 int literal_length() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100331 // Excluding terminal '\x00' added by TerminateLiteral().
332 return current_.literal_chars.length() - 1;
333 }
334
335 Vector<const char> literal() const {
336 return Vector<const char>(literal_string(), literal_length());
Steve Blocka7e24c12009-10-30 11:49:00 +0000337 }
338
339 // Returns the literal string for the next token (the token that
340 // would be returned if Next() were called).
341 const char* next_literal_string() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100342 return next_.literal_chars.start();
Steve Blocka7e24c12009-10-30 11:49:00 +0000343 }
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100344
345
Steve Blocka7e24c12009-10-30 11:49:00 +0000346 // Returns the length of the next token (that would be returned if
347 // Next() were called).
348 int next_literal_length() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100349 // Excluding terminal '\x00' added by TerminateLiteral().
350 return next_.literal_chars.length() - 1;
Steve Blockd0582a62009-12-15 09:54:21 +0000351 }
352
353 Vector<const char> next_literal() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100354 return Vector<const char>(next_literal_string(), next_literal_length());
Steve Blocka7e24c12009-10-30 11:49:00 +0000355 }
356
357 // Scans the input as a regular expression pattern, previous
358 // character(s) must be /(=). Returns true if a pattern is scanned.
359 bool ScanRegExpPattern(bool seen_equal);
360 // Returns true if regexp flags are scanned (always since flags can
361 // be empty).
362 bool ScanRegExpFlags();
363
364 // Seek forward to the given position. This operation does not
365 // work in general, for instance when there are pushed back
366 // characters, but works for seeking forward until simple delimiter
367 // tokens, which is what it is used for.
368 void SeekForward(int pos);
369
Steve Blocka7e24c12009-10-30 11:49:00 +0000370 bool stack_overflow() { return stack_overflow_; }
371
372 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
373
374 // Tells whether the buffer contains an identifier (no escapes).
375 // Used for checking if a property name is an identifier.
376 static bool IsIdentifier(unibrow::CharacterStream* buffer);
377
378 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
379 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
380 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
381 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
382
383 static const int kCharacterLookaheadBufferSize = 1;
Steve Block6ded16b2010-05-10 14:33:55 +0100384 static const int kNoEndPosition = 1;
Steve Blocka7e24c12009-10-30 11:49:00 +0000385
386 private:
Steve Blocka7e24c12009-10-30 11:49:00 +0000387 // The current and look-ahead token.
388 struct TokenDesc {
389 Token::Value token;
390 Location location;
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100391 Vector<const char> literal_chars;
Steve Blocka7e24c12009-10-30 11:49:00 +0000392 };
393
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100394 void Init(Handle<String> source,
395 unibrow::CharacterStream* stream,
396 int start_position, int end_position,
397 ParserLanguage language);
Steve Blocka7e24c12009-10-30 11:49:00 +0000398
399 // Literal buffer support
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100400 inline void StartLiteral();
401 inline void AddChar(uc32 ch);
402 inline void AddCharAdvance();
403 inline void TerminateLiteral();
404 // Stops scanning of a literal, e.g., due to an encountered error.
405 inline void DropLiteral();
Steve Blocka7e24c12009-10-30 11:49:00 +0000406
407 // Low-level scanning support.
408 void Advance() { c0_ = source_->Advance(); }
409 void PushBack(uc32 ch) {
410 source_->PushBack(ch);
411 c0_ = ch;
412 }
413
Leon Clarke4515c472010-02-03 11:58:03 +0000414 bool SkipWhiteSpace() {
415 if (is_parsing_json_) {
416 return SkipJsonWhiteSpace();
417 } else {
418 return SkipJavaScriptWhiteSpace();
419 }
420 }
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100421
Leon Clarke4515c472010-02-03 11:58:03 +0000422 bool SkipJavaScriptWhiteSpace();
423 bool SkipJsonWhiteSpace();
Steve Blocka7e24c12009-10-30 11:49:00 +0000424 Token::Value SkipSingleLineComment();
425 Token::Value SkipMultiLineComment();
426
427 inline Token::Value Select(Token::Value tok);
428 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
429
Leon Clarke4515c472010-02-03 11:58:03 +0000430 inline void Scan() {
431 if (is_parsing_json_) {
432 ScanJson();
433 } else {
434 ScanJavaScript();
435 }
436 }
437
438 // Scans a single JavaScript token.
439 void ScanJavaScript();
440
441 // Scan a single JSON token. The JSON lexical grammar is specified in the
442 // ECMAScript 5 standard, section 15.12.1.1.
443 // Recognizes all of the single-character tokens directly, or calls a function
444 // to scan a number, string or identifier literal.
445 // The only allowed whitespace characters between tokens are tab,
446 // carrige-return, newline and space.
447 void ScanJson();
448
449 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
450 // decimal number literals.
451 // It includes an optional minus sign, must have at least one
452 // digit before and after a decimal point, may not have prefixed zeros (unless
453 // the integer part is zero), and may include an exponent part (e.g., "e-10").
454 // Hexadecimal and octal numbers are not allowed.
455 Token::Value ScanJsonNumber();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100456
Leon Clarke4515c472010-02-03 11:58:03 +0000457 // A JSON string (production JSONString) is subset of valid JavaScript string
458 // literals. The string must only be double-quoted (not single-quoted), and
459 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
460 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
461 Token::Value ScanJsonString();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100462
Leon Clarke4515c472010-02-03 11:58:03 +0000463 // Used to recognizes one of the literals "true", "false", or "null". These
464 // are the only valid JSON identifiers (productions JSONBooleanLiteral,
465 // JSONNullLiteral).
466 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
467
Steve Blocka7e24c12009-10-30 11:49:00 +0000468 void ScanDecimalDigits();
469 Token::Value ScanNumber(bool seen_period);
470 Token::Value ScanIdentifier();
471 uc32 ScanHexEscape(uc32 c, int length);
472 uc32 ScanOctalEscape(uc32 c, int length);
473 void ScanEscape();
474 Token::Value ScanString();
475
476 // Scans a possible HTML comment -- begins with '<!'.
477 Token::Value ScanHtmlComment();
478
479 // Return the current source position.
480 int source_pos() {
Steve Block6ded16b2010-05-10 14:33:55 +0100481 return source_->pos() - kCharacterLookaheadBufferSize;
Steve Blocka7e24c12009-10-30 11:49:00 +0000482 }
483
484 // Decodes a unicode escape-sequence which is part of an identifier.
485 // If the escape sequence cannot be decoded the result is kBadRune.
486 uc32 ScanIdentifierUnicodeEscape();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100487
488 TokenDesc current_; // desc for current token (as returned by Next())
489 TokenDesc next_; // desc for next token (one token look-ahead)
490 bool has_line_terminator_before_next_;
491 bool is_pre_parsing_;
492 bool is_parsing_json_;
493
494 // Different UTF16 buffers used to pull characters from. Based on input one of
495 // these will be initialized as the actual data source.
496 CharacterStreamUTF16Buffer char_stream_buffer_;
497 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
498 two_byte_string_buffer_;
499 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
500
501 // Source. Will point to one of the buffers declared above.
502 UTF16Buffer* source_;
503
504 // Used to convert the source string into a character stream when a stream
505 // is not passed to the scanner.
506 SafeStringInputBuffer safe_string_input_buffer_;
507
508 // Buffer to hold literal values (identifiers, strings, numbers)
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100509 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
510 UTF8Buffer literal_buffer_;
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100511
512 bool stack_overflow_;
513 static StaticResource<Utf8Decoder> utf8_decoder_;
514
515 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
516 uc32 c0_;
Steve Blocka7e24c12009-10-30 11:49:00 +0000517};
518
519} } // namespace v8::internal
520
521#endif // V8_SCANNER_H_