blob: 6e5333bce1e5defe347d3f4cac9e3fa51be4e382 [file] [log] [blame]
Kristian Monsen0d5e1162010-09-30 15:31:59 +01001// Copyright 2010 the V8 project authors. All rights reserved.
Steve Blocka7e24c12009-10-30 11:49:00 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40 UTF8Buffer();
41 ~UTF8Buffer();
42
Kristian Monsen80d68ea2010-09-08 11:05:35 +010043 inline void AddChar(uc32 c) {
44 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
45 buffer_.Add(static_cast<char>(c));
Steve Blocka7e24c12009-10-30 11:49:00 +000046 } else {
47 AddCharSlow(c);
48 }
49 }
50
Kristian Monsen80d68ea2010-09-08 11:05:35 +010051 void StartLiteral() {
52 buffer_.StartSequence();
53 }
54
55 Vector<const char> EndLiteral() {
56 buffer_.Add(kEndMarker);
57 Vector<char> sequence = buffer_.EndSequence();
58 return Vector<const char>(sequence.start(), sequence.length());
59 }
60
61 void DropLiteral() {
62 buffer_.DropSequence();
63 }
64
Steve Blockd0582a62009-12-15 09:54:21 +000065 void Reset() {
Kristian Monsen80d68ea2010-09-08 11:05:35 +010066 buffer_.Reset();
Steve Blockd0582a62009-12-15 09:54:21 +000067 }
68
Kristian Monsen80d68ea2010-09-08 11:05:35 +010069 // The end marker added after a parsed literal.
70 // Using zero allows the usage of strlen and similar functions on
71 // identifiers and numbers (but not strings, since they may contain zero
72 // bytes).
73 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
74 // an utf-8 string. This requires changes in all places that uses
75 // str-functions on the literals, but allows a single pointer to represent
76 // the literal, even if it contains embedded zeros.
77 static const char kEndMarker = '\x00';
Steve Blocka7e24c12009-10-30 11:49:00 +000078 private:
Steve Blockd0582a62009-12-15 09:54:21 +000079 static const int kInitialCapacity = 256;
Kristian Monsen80d68ea2010-09-08 11:05:35 +010080 SequenceCollector<char, 4> buffer_;
Steve Blocka7e24c12009-10-30 11:49:00 +000081
82 void AddCharSlow(uc32 c);
83};
84
85
Steve Block6ded16b2010-05-10 14:33:55 +010086// Interface through which the scanner reads characters from the input source.
Steve Blocka7e24c12009-10-30 11:49:00 +000087class UTF16Buffer {
88 public:
89 UTF16Buffer();
90 virtual ~UTF16Buffer() {}
91
92 virtual void PushBack(uc32 ch) = 0;
Steve Block6ded16b2010-05-10 14:33:55 +010093 // Returns a value < 0 when the buffer end is reached.
Steve Blocka7e24c12009-10-30 11:49:00 +000094 virtual uc32 Advance() = 0;
95 virtual void SeekForward(int pos) = 0;
96
97 int pos() const { return pos_; }
Steve Blocka7e24c12009-10-30 11:49:00 +000098
99 protected:
Steve Block6ded16b2010-05-10 14:33:55 +0100100 int pos_; // Current position in the buffer.
101 int end_; // Position where scanning should stop (EOF).
Steve Blocka7e24c12009-10-30 11:49:00 +0000102};
103
104
Steve Block6ded16b2010-05-10 14:33:55 +0100105// UTF16 buffer to read characters from a character stream.
Steve Blocka7e24c12009-10-30 11:49:00 +0000106class CharacterStreamUTF16Buffer: public UTF16Buffer {
107 public:
108 CharacterStreamUTF16Buffer();
109 virtual ~CharacterStreamUTF16Buffer() {}
Steve Block6ded16b2010-05-10 14:33:55 +0100110 void Initialize(Handle<String> data,
111 unibrow::CharacterStream* stream,
112 int start_position,
113 int end_position);
Steve Blocka7e24c12009-10-30 11:49:00 +0000114 virtual void PushBack(uc32 ch);
115 virtual uc32 Advance();
116 virtual void SeekForward(int pos);
117
118 private:
119 List<uc32> pushback_buffer_;
120 uc32 last_;
121 unibrow::CharacterStream* stream_;
122
123 List<uc32>* pushback_buffer() { return &pushback_buffer_; }
124};
125
126
Steve Block6ded16b2010-05-10 14:33:55 +0100127// UTF16 buffer to read characters from an external string.
128template <typename StringType, typename CharType>
129class ExternalStringUTF16Buffer: public UTF16Buffer {
Steve Blocka7e24c12009-10-30 11:49:00 +0000130 public:
Steve Block6ded16b2010-05-10 14:33:55 +0100131 ExternalStringUTF16Buffer();
132 virtual ~ExternalStringUTF16Buffer() {}
133 void Initialize(Handle<StringType> data,
134 int start_position,
135 int end_position);
Steve Blocka7e24c12009-10-30 11:49:00 +0000136 virtual void PushBack(uc32 ch);
137 virtual uc32 Advance();
138 virtual void SeekForward(int pos);
139
140 private:
Steve Block6ded16b2010-05-10 14:33:55 +0100141 const CharType* raw_data_; // Pointer to the actual array of characters.
Steve Blocka7e24c12009-10-30 11:49:00 +0000142};
143
144
Steve Blockd0582a62009-12-15 09:54:21 +0000145class KeywordMatcher {
146// Incrementally recognize keywords.
147//
148// Recognized keywords:
149// break case catch const* continue debugger* default delete do else
150// finally false for function if in instanceof native* new null
151// return switch this throw true try typeof var void while with
152//
153// *: Actually "future reserved keywords". These are the only ones we
154// recognized, the remaining are allowed as identifiers.
155 public:
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100156 KeywordMatcher()
157 : state_(INITIAL),
158 token_(Token::IDENTIFIER),
159 keyword_(NULL),
160 counter_(0),
161 keyword_token_(Token::ILLEGAL) {}
Steve Blockd0582a62009-12-15 09:54:21 +0000162
163 Token::Value token() { return token_; }
164
165 inline void AddChar(uc32 input) {
166 if (state_ != UNMATCHABLE) {
167 Step(input);
168 }
169 }
170
171 void Fail() {
172 token_ = Token::IDENTIFIER;
173 state_ = UNMATCHABLE;
174 }
175
176 private:
177 enum State {
178 UNMATCHABLE,
179 INITIAL,
180 KEYWORD_PREFIX,
181 KEYWORD_MATCHED,
182 C,
183 CA,
184 CO,
185 CON,
186 D,
187 DE,
188 F,
189 I,
190 IN,
191 N,
192 T,
193 TH,
194 TR,
195 V,
196 W
197 };
198
199 struct FirstState {
200 const char* keyword;
201 State state;
202 Token::Value token;
203 };
204
205 // Range of possible first characters of a keyword.
206 static const unsigned int kFirstCharRangeMin = 'b';
207 static const unsigned int kFirstCharRangeMax = 'w';
208 static const unsigned int kFirstCharRangeLength =
209 kFirstCharRangeMax - kFirstCharRangeMin + 1;
210 // State map for first keyword character range.
211 static FirstState first_states_[kFirstCharRangeLength];
212
Steve Blockd0582a62009-12-15 09:54:21 +0000213 // If input equals keyword's character at position, continue matching keyword
214 // from that position.
215 inline bool MatchKeywordStart(uc32 input,
216 const char* keyword,
217 int position,
218 Token::Value token_if_match) {
219 if (input == keyword[position]) {
220 state_ = KEYWORD_PREFIX;
221 this->keyword_ = keyword;
222 this->counter_ = position + 1;
223 this->keyword_token_ = token_if_match;
224 return true;
225 }
226 return false;
227 }
228
229 // If input equals match character, transition to new state and return true.
230 inline bool MatchState(uc32 input, char match, State new_state) {
231 if (input == match) {
232 state_ = new_state;
233 return true;
234 }
235 return false;
236 }
237
238 inline bool MatchKeyword(uc32 input,
239 char match,
240 State new_state,
241 Token::Value keyword_token) {
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100242 if (input != match) {
243 return false;
Steve Blockd0582a62009-12-15 09:54:21 +0000244 }
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100245 state_ = new_state;
246 token_ = keyword_token;
247 return true;
Steve Blockd0582a62009-12-15 09:54:21 +0000248 }
249
250 void Step(uc32 input);
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100251
252 // Current state.
253 State state_;
254 // Token for currently added characters.
255 Token::Value token_;
256
257 // Matching a specific keyword string (there is only one possible valid
258 // keyword with the current prefix).
259 const char* keyword_;
260 int counter_;
261 Token::Value keyword_token_;
Steve Blockd0582a62009-12-15 09:54:21 +0000262};
263
264
// Parser mode selector.
enum ParserMode {
  PARSE,
  PREPARSE
};

// Input language selector, as accepted by Scanner::Initialize().
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
267
268
Steve Blocka7e24c12009-10-30 11:49:00 +0000269class Scanner {
270 public:
Steve Blocka7e24c12009-10-30 11:49:00 +0000271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
272
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100273 class LiteralScope {
274 public:
275 explicit LiteralScope(Scanner* self);
276 ~LiteralScope();
277 void Complete();
278
279 private:
280 Scanner* scanner_;
281 bool complete_;
282 };
283
Kristian Monsen0d5e1162010-09-30 15:31:59 +0100284 Scanner();
Steve Blocka7e24c12009-10-30 11:49:00 +0000285
Steve Block6ded16b2010-05-10 14:33:55 +0100286 // Initialize the Scanner to scan source.
287 void Initialize(Handle<String> source,
288 ParserLanguage language);
289 void Initialize(Handle<String> source,
290 unibrow::CharacterStream* stream,
291 ParserLanguage language);
292 void Initialize(Handle<String> source,
293 int start_position, int end_position,
294 ParserLanguage language);
Steve Blocka7e24c12009-10-30 11:49:00 +0000295
296 // Returns the next token.
297 Token::Value Next();
298
299 // One token look-ahead (past the token returned by Next()).
Kristian Monsen0d5e1162010-09-30 15:31:59 +0100300 Token::Value peek() const { return next_.token; }
Steve Blocka7e24c12009-10-30 11:49:00 +0000301
302 // Returns true if there was a line terminator before the peek'ed token.
303 bool has_line_terminator_before_next() const {
304 return has_line_terminator_before_next_;
305 }
306
307 struct Location {
308 Location(int b, int e) : beg_pos(b), end_pos(e) { }
309 Location() : beg_pos(0), end_pos(0) { }
310 int beg_pos;
311 int end_pos;
312 };
313
314 // Returns the location information for the current token
315 // (the token returned by Next()).
Kristian Monsen0d5e1162010-09-30 15:31:59 +0100316 Location location() const { return current_.location; }
317 Location peek_location() const { return next_.location; }
Steve Blocka7e24c12009-10-30 11:49:00 +0000318
319 // Returns the literal string, if any, for the current token (the
320 // token returned by Next()). The string is 0-terminated and in
321 // UTF-8 format; they may contain 0-characters. Literal strings are
322 // collected for identifiers, strings, and numbers.
Steve Blockd0582a62009-12-15 09:54:21 +0000323 // These functions only give the correct result if the literal
324 // was scanned between calls to StartLiteral() and TerminateLiteral().
Steve Blocka7e24c12009-10-30 11:49:00 +0000325 const char* literal_string() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100326 return current_.literal_chars.start();
Steve Blocka7e24c12009-10-30 11:49:00 +0000327 }
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100328
Steve Blocka7e24c12009-10-30 11:49:00 +0000329 int literal_length() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100330 // Excluding terminal '\x00' added by TerminateLiteral().
331 return current_.literal_chars.length() - 1;
332 }
333
334 Vector<const char> literal() const {
335 return Vector<const char>(literal_string(), literal_length());
Steve Blocka7e24c12009-10-30 11:49:00 +0000336 }
337
338 // Returns the literal string for the next token (the token that
339 // would be returned if Next() were called).
340 const char* next_literal_string() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100341 return next_.literal_chars.start();
Steve Blocka7e24c12009-10-30 11:49:00 +0000342 }
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100343
344
Steve Blocka7e24c12009-10-30 11:49:00 +0000345 // Returns the length of the next token (that would be returned if
346 // Next() were called).
347 int next_literal_length() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100348 // Excluding terminal '\x00' added by TerminateLiteral().
349 return next_.literal_chars.length() - 1;
Steve Blockd0582a62009-12-15 09:54:21 +0000350 }
351
352 Vector<const char> next_literal() const {
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100353 return Vector<const char>(next_literal_string(), next_literal_length());
Steve Blocka7e24c12009-10-30 11:49:00 +0000354 }
355
356 // Scans the input as a regular expression pattern, previous
357 // character(s) must be /(=). Returns true if a pattern is scanned.
358 bool ScanRegExpPattern(bool seen_equal);
359 // Returns true if regexp flags are scanned (always since flags can
360 // be empty).
361 bool ScanRegExpFlags();
362
363 // Seek forward to the given position. This operation does not
364 // work in general, for instance when there are pushed back
365 // characters, but works for seeking forward until simple delimiter
366 // tokens, which is what it is used for.
367 void SeekForward(int pos);
368
Steve Blocka7e24c12009-10-30 11:49:00 +0000369 bool stack_overflow() { return stack_overflow_; }
370
371 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
372
373 // Tells whether the buffer contains an identifier (no escapes).
374 // Used for checking if a property name is an identifier.
375 static bool IsIdentifier(unibrow::CharacterStream* buffer);
376
377 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
378 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
379 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
380 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
381
382 static const int kCharacterLookaheadBufferSize = 1;
Steve Block6ded16b2010-05-10 14:33:55 +0100383 static const int kNoEndPosition = 1;
Steve Blocka7e24c12009-10-30 11:49:00 +0000384
385 private:
Steve Blocka7e24c12009-10-30 11:49:00 +0000386 // The current and look-ahead token.
387 struct TokenDesc {
388 Token::Value token;
389 Location location;
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100390 Vector<const char> literal_chars;
Steve Blocka7e24c12009-10-30 11:49:00 +0000391 };
392
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100393 void Init(Handle<String> source,
394 unibrow::CharacterStream* stream,
395 int start_position, int end_position,
396 ParserLanguage language);
Steve Blocka7e24c12009-10-30 11:49:00 +0000397
398 // Literal buffer support
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100399 inline void StartLiteral();
400 inline void AddChar(uc32 ch);
401 inline void AddCharAdvance();
402 inline void TerminateLiteral();
403 // Stops scanning of a literal, e.g., due to an encountered error.
404 inline void DropLiteral();
Steve Blocka7e24c12009-10-30 11:49:00 +0000405
406 // Low-level scanning support.
407 void Advance() { c0_ = source_->Advance(); }
408 void PushBack(uc32 ch) {
409 source_->PushBack(ch);
410 c0_ = ch;
411 }
412
Leon Clarke4515c472010-02-03 11:58:03 +0000413 bool SkipWhiteSpace() {
414 if (is_parsing_json_) {
415 return SkipJsonWhiteSpace();
416 } else {
417 return SkipJavaScriptWhiteSpace();
418 }
419 }
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100420
Leon Clarke4515c472010-02-03 11:58:03 +0000421 bool SkipJavaScriptWhiteSpace();
422 bool SkipJsonWhiteSpace();
Steve Blocka7e24c12009-10-30 11:49:00 +0000423 Token::Value SkipSingleLineComment();
424 Token::Value SkipMultiLineComment();
425
426 inline Token::Value Select(Token::Value tok);
427 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
428
Leon Clarke4515c472010-02-03 11:58:03 +0000429 inline void Scan() {
430 if (is_parsing_json_) {
431 ScanJson();
432 } else {
433 ScanJavaScript();
434 }
435 }
436
437 // Scans a single JavaScript token.
438 void ScanJavaScript();
439
440 // Scan a single JSON token. The JSON lexical grammar is specified in the
441 // ECMAScript 5 standard, section 15.12.1.1.
442 // Recognizes all of the single-character tokens directly, or calls a function
443 // to scan a number, string or identifier literal.
444 // The only allowed whitespace characters between tokens are tab,
445 // carrige-return, newline and space.
446 void ScanJson();
447
448 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
449 // decimal number literals.
450 // It includes an optional minus sign, must have at least one
451 // digit before and after a decimal point, may not have prefixed zeros (unless
452 // the integer part is zero), and may include an exponent part (e.g., "e-10").
453 // Hexadecimal and octal numbers are not allowed.
454 Token::Value ScanJsonNumber();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100455
Leon Clarke4515c472010-02-03 11:58:03 +0000456 // A JSON string (production JSONString) is subset of valid JavaScript string
457 // literals. The string must only be double-quoted (not single-quoted), and
458 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
459 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
460 Token::Value ScanJsonString();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100461
Leon Clarke4515c472010-02-03 11:58:03 +0000462 // Used to recognizes one of the literals "true", "false", or "null". These
463 // are the only valid JSON identifiers (productions JSONBooleanLiteral,
464 // JSONNullLiteral).
465 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
466
Steve Blocka7e24c12009-10-30 11:49:00 +0000467 void ScanDecimalDigits();
468 Token::Value ScanNumber(bool seen_period);
469 Token::Value ScanIdentifier();
470 uc32 ScanHexEscape(uc32 c, int length);
471 uc32 ScanOctalEscape(uc32 c, int length);
472 void ScanEscape();
473 Token::Value ScanString();
474
475 // Scans a possible HTML comment -- begins with '<!'.
476 Token::Value ScanHtmlComment();
477
478 // Return the current source position.
479 int source_pos() {
Steve Block6ded16b2010-05-10 14:33:55 +0100480 return source_->pos() - kCharacterLookaheadBufferSize;
Steve Blocka7e24c12009-10-30 11:49:00 +0000481 }
482
483 // Decodes a unicode escape-sequence which is part of an identifier.
484 // If the escape sequence cannot be decoded the result is kBadRune.
485 uc32 ScanIdentifierUnicodeEscape();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100486
487 TokenDesc current_; // desc for current token (as returned by Next())
488 TokenDesc next_; // desc for next token (one token look-ahead)
489 bool has_line_terminator_before_next_;
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100490 bool is_parsing_json_;
491
492 // Different UTF16 buffers used to pull characters from. Based on input one of
493 // these will be initialized as the actual data source.
494 CharacterStreamUTF16Buffer char_stream_buffer_;
495 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
496 two_byte_string_buffer_;
497 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
498
499 // Source. Will point to one of the buffers declared above.
500 UTF16Buffer* source_;
501
502 // Used to convert the source string into a character stream when a stream
503 // is not passed to the scanner.
504 SafeStringInputBuffer safe_string_input_buffer_;
505
506 // Buffer to hold literal values (identifiers, strings, numbers)
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100507 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
508 UTF8Buffer literal_buffer_;
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100509
510 bool stack_overflow_;
511 static StaticResource<Utf8Decoder> utf8_decoder_;
512
513 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
514 uc32 c0_;
Steve Blocka7e24c12009-10-30 11:49:00 +0000515};
516
517} } // namespace v8::internal
518
519#endif // V8_SCANNER_H_