blob: f0035c0eb3e8b633f5c5761fe8eb41c5aa6f7e87 [file] [log] [blame]
Steve Blocka7e24c12009-10-30 11:49:00 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40 UTF8Buffer();
41 ~UTF8Buffer();
42
43 void AddChar(uc32 c) {
Steve Blockd0582a62009-12-15 09:54:21 +000044 ASSERT_NOT_NULL(data_);
Steve Blocka7e24c12009-10-30 11:49:00 +000045 if (cursor_ <= limit_ &&
46 static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
47 *cursor_++ = static_cast<char>(c);
48 } else {
49 AddCharSlow(c);
50 }
51 }
52
Steve Blockd0582a62009-12-15 09:54:21 +000053 void Reset() {
54 if (data_ == NULL) {
55 data_ = NewArray<char>(kInitialCapacity);
56 limit_ = ComputeLimit(data_, kInitialCapacity);
57 }
58 cursor_ = data_;
59 }
60
61 int pos() const {
62 ASSERT_NOT_NULL(data_);
63 return static_cast<int>(cursor_ - data_);
64 }
65
Steve Blocka7e24c12009-10-30 11:49:00 +000066 char* data() const { return data_; }
67
68 private:
Steve Blockd0582a62009-12-15 09:54:21 +000069 static const int kInitialCapacity = 256;
Steve Blocka7e24c12009-10-30 11:49:00 +000070 char* data_;
71 char* cursor_;
72 char* limit_;
73
74 int Capacity() const {
Steve Blockd0582a62009-12-15 09:54:21 +000075 ASSERT_NOT_NULL(data_);
76 return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
Steve Blocka7e24c12009-10-30 11:49:00 +000077 }
78
79 static char* ComputeLimit(char* data, int capacity) {
80 return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
81 }
82
83 void AddCharSlow(uc32 c);
84};
85
86
87class UTF16Buffer {
88 public:
89 UTF16Buffer();
90 virtual ~UTF16Buffer() {}
91
92 virtual void PushBack(uc32 ch) = 0;
93 // returns a value < 0 when the buffer end is reached
94 virtual uc32 Advance() = 0;
95 virtual void SeekForward(int pos) = 0;
96
97 int pos() const { return pos_; }
98 int size() const { return size_; }
99 Handle<String> SubString(int start, int end);
100
101 protected:
102 Handle<String> data_;
103 int pos_;
104 int size_;
105};
106
107
108class CharacterStreamUTF16Buffer: public UTF16Buffer {
109 public:
110 CharacterStreamUTF16Buffer();
111 virtual ~CharacterStreamUTF16Buffer() {}
112 void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
113 virtual void PushBack(uc32 ch);
114 virtual uc32 Advance();
115 virtual void SeekForward(int pos);
116
117 private:
118 List<uc32> pushback_buffer_;
119 uc32 last_;
120 unibrow::CharacterStream* stream_;
121
122 List<uc32>* pushback_buffer() { return &pushback_buffer_; }
123};
124
125
126class TwoByteStringUTF16Buffer: public UTF16Buffer {
127 public:
128 TwoByteStringUTF16Buffer();
129 virtual ~TwoByteStringUTF16Buffer() {}
130 void Initialize(Handle<ExternalTwoByteString> data);
131 virtual void PushBack(uc32 ch);
132 virtual uc32 Advance();
133 virtual void SeekForward(int pos);
134
135 private:
136 const uint16_t* raw_data_;
137};
138
139
Steve Blockd0582a62009-12-15 09:54:21 +0000140class KeywordMatcher {
141// Incrementally recognize keywords.
142//
143// Recognized keywords:
144// break case catch const* continue debugger* default delete do else
145// finally false for function if in instanceof native* new null
146// return switch this throw true try typeof var void while with
147//
148// *: Actually "future reserved keywords". These are the only ones we
149// recognized, the remaining are allowed as identifiers.
150 public:
151 KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
152
153 Token::Value token() { return token_; }
154
155 inline void AddChar(uc32 input) {
156 if (state_ != UNMATCHABLE) {
157 Step(input);
158 }
159 }
160
161 void Fail() {
162 token_ = Token::IDENTIFIER;
163 state_ = UNMATCHABLE;
164 }
165
166 private:
167 enum State {
168 UNMATCHABLE,
169 INITIAL,
170 KEYWORD_PREFIX,
171 KEYWORD_MATCHED,
172 C,
173 CA,
174 CO,
175 CON,
176 D,
177 DE,
178 F,
179 I,
180 IN,
181 N,
182 T,
183 TH,
184 TR,
185 V,
186 W
187 };
188
189 struct FirstState {
190 const char* keyword;
191 State state;
192 Token::Value token;
193 };
194
195 // Range of possible first characters of a keyword.
196 static const unsigned int kFirstCharRangeMin = 'b';
197 static const unsigned int kFirstCharRangeMax = 'w';
198 static const unsigned int kFirstCharRangeLength =
199 kFirstCharRangeMax - kFirstCharRangeMin + 1;
200 // State map for first keyword character range.
201 static FirstState first_states_[kFirstCharRangeLength];
202
203 // Current state.
204 State state_;
205 // Token for currently added characters.
206 Token::Value token_;
207
208 // Matching a specific keyword string (there is only one possible valid
209 // keyword with the current prefix).
210 const char* keyword_;
211 int counter_;
212 Token::Value keyword_token_;
213
214 // If input equals keyword's character at position, continue matching keyword
215 // from that position.
216 inline bool MatchKeywordStart(uc32 input,
217 const char* keyword,
218 int position,
219 Token::Value token_if_match) {
220 if (input == keyword[position]) {
221 state_ = KEYWORD_PREFIX;
222 this->keyword_ = keyword;
223 this->counter_ = position + 1;
224 this->keyword_token_ = token_if_match;
225 return true;
226 }
227 return false;
228 }
229
230 // If input equals match character, transition to new state and return true.
231 inline bool MatchState(uc32 input, char match, State new_state) {
232 if (input == match) {
233 state_ = new_state;
234 return true;
235 }
236 return false;
237 }
238
239 inline bool MatchKeyword(uc32 input,
240 char match,
241 State new_state,
242 Token::Value keyword_token) {
243 if (input == match) { // Matched "do".
244 state_ = new_state;
245 token_ = keyword_token;
246 return true;
247 }
248 return false;
249 }
250
251 void Step(uc32 input);
252};
253
254
// Mode handed to the Scanner constructor.
enum ParserMode {
  PARSE,
  PREPARSE
};

// Language handed to Scanner::Init(); the scanner uses separate token and
// whitespace scanning routines for JavaScript and for JSON.
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
257
258
Steve Blocka7e24c12009-10-30 11:49:00 +0000259class Scanner {
260 public:
Steve Blocka7e24c12009-10-30 11:49:00 +0000261 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
262
263 // Construction
Leon Clarke4515c472010-02-03 11:58:03 +0000264 explicit Scanner(ParserMode parse_mode);
Steve Blocka7e24c12009-10-30 11:49:00 +0000265
266 // Initialize the Scanner to scan source:
267 void Init(Handle<String> source,
268 unibrow::CharacterStream* stream,
Leon Clarke4515c472010-02-03 11:58:03 +0000269 int position,
270 ParserLanguage language);
Steve Blocka7e24c12009-10-30 11:49:00 +0000271
272 // Returns the next token.
273 Token::Value Next();
274
275 // One token look-ahead (past the token returned by Next()).
276 Token::Value peek() const { return next_.token; }
277
278 // Returns true if there was a line terminator before the peek'ed token.
279 bool has_line_terminator_before_next() const {
280 return has_line_terminator_before_next_;
281 }
282
283 struct Location {
284 Location(int b, int e) : beg_pos(b), end_pos(e) { }
285 Location() : beg_pos(0), end_pos(0) { }
286 int beg_pos;
287 int end_pos;
288 };
289
290 // Returns the location information for the current token
291 // (the token returned by Next()).
292 Location location() const { return current_.location; }
293 Location peek_location() const { return next_.location; }
294
295 // Returns the literal string, if any, for the current token (the
296 // token returned by Next()). The string is 0-terminated and in
297 // UTF-8 format; they may contain 0-characters. Literal strings are
298 // collected for identifiers, strings, and numbers.
Steve Blockd0582a62009-12-15 09:54:21 +0000299 // These functions only give the correct result if the literal
300 // was scanned between calls to StartLiteral() and TerminateLiteral().
Steve Blocka7e24c12009-10-30 11:49:00 +0000301 const char* literal_string() const {
Steve Blockd0582a62009-12-15 09:54:21 +0000302 return current_.literal_buffer->data();
Steve Blocka7e24c12009-10-30 11:49:00 +0000303 }
304 int literal_length() const {
Steve Blockd0582a62009-12-15 09:54:21 +0000305 // Excluding terminal '\0' added by TerminateLiteral().
306 return current_.literal_buffer->pos() - 1;
Steve Blocka7e24c12009-10-30 11:49:00 +0000307 }
308
309 // Returns the literal string for the next token (the token that
310 // would be returned if Next() were called).
311 const char* next_literal_string() const {
Steve Blockd0582a62009-12-15 09:54:21 +0000312 return next_.literal_buffer->data();
Steve Blocka7e24c12009-10-30 11:49:00 +0000313 }
314 // Returns the length of the next token (that would be returned if
315 // Next() were called).
316 int next_literal_length() const {
Steve Blockd0582a62009-12-15 09:54:21 +0000317 return next_.literal_buffer->pos() - 1;
318 }
319
320 Vector<const char> next_literal() const {
321 return Vector<const char>(next_literal_string(),
322 next_literal_length());
Steve Blocka7e24c12009-10-30 11:49:00 +0000323 }
324
325 // Scans the input as a regular expression pattern, previous
326 // character(s) must be /(=). Returns true if a pattern is scanned.
327 bool ScanRegExpPattern(bool seen_equal);
328 // Returns true if regexp flags are scanned (always since flags can
329 // be empty).
330 bool ScanRegExpFlags();
331
332 // Seek forward to the given position. This operation does not
333 // work in general, for instance when there are pushed back
334 // characters, but works for seeking forward until simple delimiter
335 // tokens, which is what it is used for.
336 void SeekForward(int pos);
337
338 Handle<String> SubString(int start_pos, int end_pos);
339 bool stack_overflow() { return stack_overflow_; }
340
341 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
342
343 // Tells whether the buffer contains an identifier (no escapes).
344 // Used for checking if a property name is an identifier.
345 static bool IsIdentifier(unibrow::CharacterStream* buffer);
346
347 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
348 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
349 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
350 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
351
352 static const int kCharacterLookaheadBufferSize = 1;
353
354 private:
355 CharacterStreamUTF16Buffer char_stream_buffer_;
356 TwoByteStringUTF16Buffer two_byte_string_buffer_;
357
358 // Source.
359 UTF16Buffer* source_;
360 int position_;
361
362 // Buffer to hold literal values (identifiers, strings, numbers)
363 // using 0-terminated UTF-8 encoding.
Steve Blockd0582a62009-12-15 09:54:21 +0000364 UTF8Buffer literal_buffer_1_;
365 UTF8Buffer literal_buffer_2_;
Steve Blocka7e24c12009-10-30 11:49:00 +0000366
367 bool stack_overflow_;
368 static StaticResource<Utf8Decoder> utf8_decoder_;
369
370 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
371 uc32 c0_;
372
373 // The current and look-ahead token.
374 struct TokenDesc {
375 Token::Value token;
376 Location location;
Steve Blockd0582a62009-12-15 09:54:21 +0000377 UTF8Buffer* literal_buffer;
Steve Blocka7e24c12009-10-30 11:49:00 +0000378 };
379
380 TokenDesc current_; // desc for current token (as returned by Next())
381 TokenDesc next_; // desc for next token (one token look-ahead)
382 bool has_line_terminator_before_next_;
383 bool is_pre_parsing_;
Leon Clarke4515c472010-02-03 11:58:03 +0000384 bool is_parsing_json_;
Steve Blocka7e24c12009-10-30 11:49:00 +0000385
386 // Literal buffer support
387 void StartLiteral();
388 void AddChar(uc32 ch);
389 void AddCharAdvance();
390 void TerminateLiteral();
391
392 // Low-level scanning support.
393 void Advance() { c0_ = source_->Advance(); }
394 void PushBack(uc32 ch) {
395 source_->PushBack(ch);
396 c0_ = ch;
397 }
398
Leon Clarke4515c472010-02-03 11:58:03 +0000399 bool SkipWhiteSpace() {
400 if (is_parsing_json_) {
401 return SkipJsonWhiteSpace();
402 } else {
403 return SkipJavaScriptWhiteSpace();
404 }
405 }
406 bool SkipJavaScriptWhiteSpace();
407 bool SkipJsonWhiteSpace();
Steve Blocka7e24c12009-10-30 11:49:00 +0000408 Token::Value SkipSingleLineComment();
409 Token::Value SkipMultiLineComment();
410
411 inline Token::Value Select(Token::Value tok);
412 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
413
Leon Clarke4515c472010-02-03 11:58:03 +0000414 inline void Scan() {
415 if (is_parsing_json_) {
416 ScanJson();
417 } else {
418 ScanJavaScript();
419 }
420 }
421
422 // Scans a single JavaScript token.
423 void ScanJavaScript();
424
425 // Scan a single JSON token. The JSON lexical grammar is specified in the
426 // ECMAScript 5 standard, section 15.12.1.1.
427 // Recognizes all of the single-character tokens directly, or calls a function
428 // to scan a number, string or identifier literal.
429 // The only allowed whitespace characters between tokens are tab,
430 // carrige-return, newline and space.
431 void ScanJson();
432
433 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
434 // decimal number literals.
435 // It includes an optional minus sign, must have at least one
436 // digit before and after a decimal point, may not have prefixed zeros (unless
437 // the integer part is zero), and may include an exponent part (e.g., "e-10").
438 // Hexadecimal and octal numbers are not allowed.
439 Token::Value ScanJsonNumber();
440 // A JSON string (production JSONString) is subset of valid JavaScript string
441 // literals. The string must only be double-quoted (not single-quoted), and
442 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
443 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
444 Token::Value ScanJsonString();
445 // Used to recognizes one of the literals "true", "false", or "null". These
446 // are the only valid JSON identifiers (productions JSONBooleanLiteral,
447 // JSONNullLiteral).
448 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
449
Steve Blocka7e24c12009-10-30 11:49:00 +0000450 void ScanDecimalDigits();
451 Token::Value ScanNumber(bool seen_period);
452 Token::Value ScanIdentifier();
453 uc32 ScanHexEscape(uc32 c, int length);
454 uc32 ScanOctalEscape(uc32 c, int length);
455 void ScanEscape();
456 Token::Value ScanString();
457
458 // Scans a possible HTML comment -- begins with '<!'.
459 Token::Value ScanHtmlComment();
460
461 // Return the current source position.
462 int source_pos() {
463 return source_->pos() - kCharacterLookaheadBufferSize + position_;
464 }
465
466 // Decodes a unicode escape-sequence which is part of an identifier.
467 // If the escape sequence cannot be decoded the result is kBadRune.
468 uc32 ScanIdentifierUnicodeEscape();
469};
470
471} } // namespace v8::internal
472
473#endif // V8_SCANNER_H_