Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 1 | // Copyright 2006-2008 the V8 project authors. All rights reserved. |
| 2 | // Redistribution and use in source and binary forms, with or without |
| 3 | // modification, are permitted provided that the following conditions are |
| 4 | // met: |
| 5 | // |
| 6 | // * Redistributions of source code must retain the above copyright |
| 7 | // notice, this list of conditions and the following disclaimer. |
| 8 | // * Redistributions in binary form must reproduce the above |
| 9 | // copyright notice, this list of conditions and the following |
| 10 | // disclaimer in the documentation and/or other materials provided |
| 11 | // with the distribution. |
| 12 | // * Neither the name of Google Inc. nor the names of its |
| 13 | // contributors may be used to endorse or promote products derived |
| 14 | // from this software without specific prior written permission. |
| 15 | // |
| 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | |
| 28 | #ifndef V8_SCANNER_H_ |
| 29 | #define V8_SCANNER_H_ |
| 30 | |
| 31 | #include "token.h" |
| 32 | #include "char-predicates-inl.h" |
| 33 | |
| 34 | namespace v8 { |
| 35 | namespace internal { |
| 36 | |
| 37 | |
| 38 | class UTF8Buffer { |
| 39 | public: |
| 40 | UTF8Buffer(); |
| 41 | ~UTF8Buffer(); |
| 42 | |
| 43 | void AddChar(uc32 c) { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 44 | ASSERT_NOT_NULL(data_); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 45 | if (cursor_ <= limit_ && |
| 46 | static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { |
| 47 | *cursor_++ = static_cast<char>(c); |
| 48 | } else { |
| 49 | AddCharSlow(c); |
| 50 | } |
| 51 | } |
| 52 | |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 53 | void Reset() { |
| 54 | if (data_ == NULL) { |
| 55 | data_ = NewArray<char>(kInitialCapacity); |
| 56 | limit_ = ComputeLimit(data_, kInitialCapacity); |
| 57 | } |
| 58 | cursor_ = data_; |
| 59 | } |
| 60 | |
| 61 | int pos() const { |
| 62 | ASSERT_NOT_NULL(data_); |
| 63 | return static_cast<int>(cursor_ - data_); |
| 64 | } |
| 65 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 66 | char* data() const { return data_; } |
| 67 | |
| 68 | private: |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 69 | static const int kInitialCapacity = 256; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 70 | char* data_; |
| 71 | char* cursor_; |
| 72 | char* limit_; |
| 73 | |
| 74 | int Capacity() const { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 75 | ASSERT_NOT_NULL(data_); |
| 76 | return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 77 | } |
| 78 | |
| 79 | static char* ComputeLimit(char* data, int capacity) { |
| 80 | return (data + capacity) - unibrow::Utf8::kMaxEncodedSize; |
| 81 | } |
| 82 | |
| 83 | void AddCharSlow(uc32 c); |
| 84 | }; |
| 85 | |
| 86 | |
| 87 | class UTF16Buffer { |
| 88 | public: |
| 89 | UTF16Buffer(); |
| 90 | virtual ~UTF16Buffer() {} |
| 91 | |
| 92 | virtual void PushBack(uc32 ch) = 0; |
| 93 | // returns a value < 0 when the buffer end is reached |
| 94 | virtual uc32 Advance() = 0; |
| 95 | virtual void SeekForward(int pos) = 0; |
| 96 | |
| 97 | int pos() const { return pos_; } |
| 98 | int size() const { return size_; } |
| 99 | Handle<String> SubString(int start, int end); |
| 100 | |
| 101 | protected: |
| 102 | Handle<String> data_; |
| 103 | int pos_; |
| 104 | int size_; |
| 105 | }; |
| 106 | |
| 107 | |
| 108 | class CharacterStreamUTF16Buffer: public UTF16Buffer { |
| 109 | public: |
| 110 | CharacterStreamUTF16Buffer(); |
| 111 | virtual ~CharacterStreamUTF16Buffer() {} |
| 112 | void Initialize(Handle<String> data, unibrow::CharacterStream* stream); |
| 113 | virtual void PushBack(uc32 ch); |
| 114 | virtual uc32 Advance(); |
| 115 | virtual void SeekForward(int pos); |
| 116 | |
| 117 | private: |
| 118 | List<uc32> pushback_buffer_; |
| 119 | uc32 last_; |
| 120 | unibrow::CharacterStream* stream_; |
| 121 | |
| 122 | List<uc32>* pushback_buffer() { return &pushback_buffer_; } |
| 123 | }; |
| 124 | |
| 125 | |
| 126 | class TwoByteStringUTF16Buffer: public UTF16Buffer { |
| 127 | public: |
| 128 | TwoByteStringUTF16Buffer(); |
| 129 | virtual ~TwoByteStringUTF16Buffer() {} |
| 130 | void Initialize(Handle<ExternalTwoByteString> data); |
| 131 | virtual void PushBack(uc32 ch); |
| 132 | virtual uc32 Advance(); |
| 133 | virtual void SeekForward(int pos); |
| 134 | |
| 135 | private: |
| 136 | const uint16_t* raw_data_; |
| 137 | }; |
| 138 | |
| 139 | |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 140 | class KeywordMatcher { |
| 141 | // Incrementally recognize keywords. |
| 142 | // |
| 143 | // Recognized keywords: |
| 144 | // break case catch const* continue debugger* default delete do else |
| 145 | // finally false for function if in instanceof native* new null |
| 146 | // return switch this throw true try typeof var void while with |
| 147 | // |
| 148 | // *: Actually "future reserved keywords". These are the only ones we |
| 149 | // recognized, the remaining are allowed as identifiers. |
| 150 | public: |
| 151 | KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {} |
| 152 | |
| 153 | Token::Value token() { return token_; } |
| 154 | |
| 155 | inline void AddChar(uc32 input) { |
| 156 | if (state_ != UNMATCHABLE) { |
| 157 | Step(input); |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | void Fail() { |
| 162 | token_ = Token::IDENTIFIER; |
| 163 | state_ = UNMATCHABLE; |
| 164 | } |
| 165 | |
| 166 | private: |
| 167 | enum State { |
| 168 | UNMATCHABLE, |
| 169 | INITIAL, |
| 170 | KEYWORD_PREFIX, |
| 171 | KEYWORD_MATCHED, |
| 172 | C, |
| 173 | CA, |
| 174 | CO, |
| 175 | CON, |
| 176 | D, |
| 177 | DE, |
| 178 | F, |
| 179 | I, |
| 180 | IN, |
| 181 | N, |
| 182 | T, |
| 183 | TH, |
| 184 | TR, |
| 185 | V, |
| 186 | W |
| 187 | }; |
| 188 | |
| 189 | struct FirstState { |
| 190 | const char* keyword; |
| 191 | State state; |
| 192 | Token::Value token; |
| 193 | }; |
| 194 | |
| 195 | // Range of possible first characters of a keyword. |
| 196 | static const unsigned int kFirstCharRangeMin = 'b'; |
| 197 | static const unsigned int kFirstCharRangeMax = 'w'; |
| 198 | static const unsigned int kFirstCharRangeLength = |
| 199 | kFirstCharRangeMax - kFirstCharRangeMin + 1; |
| 200 | // State map for first keyword character range. |
| 201 | static FirstState first_states_[kFirstCharRangeLength]; |
| 202 | |
| 203 | // Current state. |
| 204 | State state_; |
| 205 | // Token for currently added characters. |
| 206 | Token::Value token_; |
| 207 | |
| 208 | // Matching a specific keyword string (there is only one possible valid |
| 209 | // keyword with the current prefix). |
| 210 | const char* keyword_; |
| 211 | int counter_; |
| 212 | Token::Value keyword_token_; |
| 213 | |
| 214 | // If input equals keyword's character at position, continue matching keyword |
| 215 | // from that position. |
| 216 | inline bool MatchKeywordStart(uc32 input, |
| 217 | const char* keyword, |
| 218 | int position, |
| 219 | Token::Value token_if_match) { |
| 220 | if (input == keyword[position]) { |
| 221 | state_ = KEYWORD_PREFIX; |
| 222 | this->keyword_ = keyword; |
| 223 | this->counter_ = position + 1; |
| 224 | this->keyword_token_ = token_if_match; |
| 225 | return true; |
| 226 | } |
| 227 | return false; |
| 228 | } |
| 229 | |
| 230 | // If input equals match character, transition to new state and return true. |
| 231 | inline bool MatchState(uc32 input, char match, State new_state) { |
| 232 | if (input == match) { |
| 233 | state_ = new_state; |
| 234 | return true; |
| 235 | } |
| 236 | return false; |
| 237 | } |
| 238 | |
| 239 | inline bool MatchKeyword(uc32 input, |
| 240 | char match, |
| 241 | State new_state, |
| 242 | Token::Value keyword_token) { |
| 243 | if (input == match) { // Matched "do". |
| 244 | state_ = new_state; |
| 245 | token_ = keyword_token; |
| 246 | return true; |
| 247 | } |
| 248 | return false; |
| 249 | } |
| 250 | |
| 251 | void Step(uc32 input); |
| 252 | }; |
| 253 | |
| 254 | |
// Mode selector handed to the Scanner constructor.
enum ParserMode {
  PARSE,
  PREPARSE
};

// Language selector handed to Scanner::Init().
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
| 257 | |
| 258 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 259 | class Scanner { |
| 260 | public: |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 261 | typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 262 | |
| 263 | // Construction |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame^] | 264 | explicit Scanner(ParserMode parse_mode); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 265 | |
| 266 | // Initialize the Scanner to scan source: |
| 267 | void Init(Handle<String> source, |
| 268 | unibrow::CharacterStream* stream, |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame^] | 269 | int position, |
| 270 | ParserLanguage language); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 271 | |
| 272 | // Returns the next token. |
| 273 | Token::Value Next(); |
| 274 | |
| 275 | // One token look-ahead (past the token returned by Next()). |
| 276 | Token::Value peek() const { return next_.token; } |
| 277 | |
| 278 | // Returns true if there was a line terminator before the peek'ed token. |
| 279 | bool has_line_terminator_before_next() const { |
| 280 | return has_line_terminator_before_next_; |
| 281 | } |
| 282 | |
| 283 | struct Location { |
| 284 | Location(int b, int e) : beg_pos(b), end_pos(e) { } |
| 285 | Location() : beg_pos(0), end_pos(0) { } |
| 286 | int beg_pos; |
| 287 | int end_pos; |
| 288 | }; |
| 289 | |
| 290 | // Returns the location information for the current token |
| 291 | // (the token returned by Next()). |
| 292 | Location location() const { return current_.location; } |
| 293 | Location peek_location() const { return next_.location; } |
| 294 | |
| 295 | // Returns the literal string, if any, for the current token (the |
| 296 | // token returned by Next()). The string is 0-terminated and in |
| 297 | // UTF-8 format; they may contain 0-characters. Literal strings are |
| 298 | // collected for identifiers, strings, and numbers. |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 299 | // These functions only give the correct result if the literal |
| 300 | // was scanned between calls to StartLiteral() and TerminateLiteral(). |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 301 | const char* literal_string() const { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 302 | return current_.literal_buffer->data(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 303 | } |
| 304 | int literal_length() const { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 305 | // Excluding terminal '\0' added by TerminateLiteral(). |
| 306 | return current_.literal_buffer->pos() - 1; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 307 | } |
| 308 | |
| 309 | // Returns the literal string for the next token (the token that |
| 310 | // would be returned if Next() were called). |
| 311 | const char* next_literal_string() const { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 312 | return next_.literal_buffer->data(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 313 | } |
| 314 | // Returns the length of the next token (that would be returned if |
| 315 | // Next() were called). |
| 316 | int next_literal_length() const { |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 317 | return next_.literal_buffer->pos() - 1; |
| 318 | } |
| 319 | |
| 320 | Vector<const char> next_literal() const { |
| 321 | return Vector<const char>(next_literal_string(), |
| 322 | next_literal_length()); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 323 | } |
| 324 | |
| 325 | // Scans the input as a regular expression pattern, previous |
| 326 | // character(s) must be /(=). Returns true if a pattern is scanned. |
| 327 | bool ScanRegExpPattern(bool seen_equal); |
| 328 | // Returns true if regexp flags are scanned (always since flags can |
| 329 | // be empty). |
| 330 | bool ScanRegExpFlags(); |
| 331 | |
| 332 | // Seek forward to the given position. This operation does not |
| 333 | // work in general, for instance when there are pushed back |
| 334 | // characters, but works for seeking forward until simple delimiter |
| 335 | // tokens, which is what it is used for. |
| 336 | void SeekForward(int pos); |
| 337 | |
| 338 | Handle<String> SubString(int start_pos, int end_pos); |
| 339 | bool stack_overflow() { return stack_overflow_; } |
| 340 | |
| 341 | static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } |
| 342 | |
| 343 | // Tells whether the buffer contains an identifier (no escapes). |
| 344 | // Used for checking if a property name is an identifier. |
| 345 | static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 346 | |
| 347 | static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; |
| 348 | static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |
| 349 | static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
| 350 | static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
| 351 | |
| 352 | static const int kCharacterLookaheadBufferSize = 1; |
| 353 | |
| 354 | private: |
| 355 | CharacterStreamUTF16Buffer char_stream_buffer_; |
| 356 | TwoByteStringUTF16Buffer two_byte_string_buffer_; |
| 357 | |
| 358 | // Source. |
| 359 | UTF16Buffer* source_; |
| 360 | int position_; |
| 361 | |
| 362 | // Buffer to hold literal values (identifiers, strings, numbers) |
| 363 | // using 0-terminated UTF-8 encoding. |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 364 | UTF8Buffer literal_buffer_1_; |
| 365 | UTF8Buffer literal_buffer_2_; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 366 | |
| 367 | bool stack_overflow_; |
| 368 | static StaticResource<Utf8Decoder> utf8_decoder_; |
| 369 | |
| 370 | // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 371 | uc32 c0_; |
| 372 | |
| 373 | // The current and look-ahead token. |
| 374 | struct TokenDesc { |
| 375 | Token::Value token; |
| 376 | Location location; |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 377 | UTF8Buffer* literal_buffer; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 378 | }; |
| 379 | |
| 380 | TokenDesc current_; // desc for current token (as returned by Next()) |
| 381 | TokenDesc next_; // desc for next token (one token look-ahead) |
| 382 | bool has_line_terminator_before_next_; |
| 383 | bool is_pre_parsing_; |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame^] | 384 | bool is_parsing_json_; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 385 | |
| 386 | // Literal buffer support |
| 387 | void StartLiteral(); |
| 388 | void AddChar(uc32 ch); |
| 389 | void AddCharAdvance(); |
| 390 | void TerminateLiteral(); |
| 391 | |
| 392 | // Low-level scanning support. |
| 393 | void Advance() { c0_ = source_->Advance(); } |
| 394 | void PushBack(uc32 ch) { |
| 395 | source_->PushBack(ch); |
| 396 | c0_ = ch; |
| 397 | } |
| 398 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame^] | 399 | bool SkipWhiteSpace() { |
| 400 | if (is_parsing_json_) { |
| 401 | return SkipJsonWhiteSpace(); |
| 402 | } else { |
| 403 | return SkipJavaScriptWhiteSpace(); |
| 404 | } |
| 405 | } |
| 406 | bool SkipJavaScriptWhiteSpace(); |
| 407 | bool SkipJsonWhiteSpace(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 408 | Token::Value SkipSingleLineComment(); |
| 409 | Token::Value SkipMultiLineComment(); |
| 410 | |
| 411 | inline Token::Value Select(Token::Value tok); |
| 412 | inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); |
| 413 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame^] | 414 | inline void Scan() { |
| 415 | if (is_parsing_json_) { |
| 416 | ScanJson(); |
| 417 | } else { |
| 418 | ScanJavaScript(); |
| 419 | } |
| 420 | } |
| 421 | |
| 422 | // Scans a single JavaScript token. |
| 423 | void ScanJavaScript(); |
| 424 | |
| 425 | // Scan a single JSON token. The JSON lexical grammar is specified in the |
| 426 | // ECMAScript 5 standard, section 15.12.1.1. |
| 427 | // Recognizes all of the single-character tokens directly, or calls a function |
| 428 | // to scan a number, string or identifier literal. |
| 429 | // The only allowed whitespace characters between tokens are tab, |
| 430 | // carrige-return, newline and space. |
| 431 | void ScanJson(); |
| 432 | |
| 433 | // A JSON number (production JSONNumber) is a subset of the valid JavaScript |
| 434 | // decimal number literals. |
| 435 | // It includes an optional minus sign, must have at least one |
| 436 | // digit before and after a decimal point, may not have prefixed zeros (unless |
| 437 | // the integer part is zero), and may include an exponent part (e.g., "e-10"). |
| 438 | // Hexadecimal and octal numbers are not allowed. |
| 439 | Token::Value ScanJsonNumber(); |
| 440 | // A JSON string (production JSONString) is subset of valid JavaScript string |
| 441 | // literals. The string must only be double-quoted (not single-quoted), and |
| 442 | // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and |
| 443 | // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. |
| 444 | Token::Value ScanJsonString(); |
| 445 | // Used to recognizes one of the literals "true", "false", or "null". These |
| 446 | // are the only valid JSON identifiers (productions JSONBooleanLiteral, |
| 447 | // JSONNullLiteral). |
| 448 | Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
| 449 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 450 | void ScanDecimalDigits(); |
| 451 | Token::Value ScanNumber(bool seen_period); |
| 452 | Token::Value ScanIdentifier(); |
| 453 | uc32 ScanHexEscape(uc32 c, int length); |
| 454 | uc32 ScanOctalEscape(uc32 c, int length); |
| 455 | void ScanEscape(); |
| 456 | Token::Value ScanString(); |
| 457 | |
| 458 | // Scans a possible HTML comment -- begins with '<!'. |
| 459 | Token::Value ScanHtmlComment(); |
| 460 | |
| 461 | // Return the current source position. |
| 462 | int source_pos() { |
| 463 | return source_->pos() - kCharacterLookaheadBufferSize + position_; |
| 464 | } |
| 465 | |
| 466 | // Decodes a unicode escape-sequence which is part of an identifier. |
| 467 | // If the escape sequence cannot be decoded the result is kBadRune. |
| 468 | uc32 ScanIdentifierUnicodeEscape(); |
| 469 | }; |
| 470 | |
| 471 | } } // namespace v8::internal |
| 472 | |
| 473 | #endif // V8_SCANNER_H_ |