Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 1 | // Copyright 2006-2008 the V8 project authors. All rights reserved. |
| 2 | // Redistribution and use in source and binary forms, with or without |
| 3 | // modification, are permitted provided that the following conditions are |
| 4 | // met: |
| 5 | // |
| 6 | // * Redistributions of source code must retain the above copyright |
| 7 | // notice, this list of conditions and the following disclaimer. |
| 8 | // * Redistributions in binary form must reproduce the above |
| 9 | // copyright notice, this list of conditions and the following |
| 10 | // disclaimer in the documentation and/or other materials provided |
| 11 | // with the distribution. |
| 12 | // * Neither the name of Google Inc. nor the names of its |
| 13 | // contributors may be used to endorse or promote products derived |
| 14 | // from this software without specific prior written permission. |
| 15 | // |
| 16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | |
| 28 | #ifndef V8_SCANNER_H_ |
| 29 | #define V8_SCANNER_H_ |
| 30 | |
| 31 | #include "token.h" |
| 32 | #include "char-predicates-inl.h" |
| 33 | |
| 34 | namespace v8 { |
| 35 | namespace internal { |
| 36 | |
| 37 | |
| 38 | class UTF8Buffer { |
| 39 | public: |
| 40 | UTF8Buffer(); |
| 41 | ~UTF8Buffer(); |
| 42 | |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 43 | inline void AddChar(uc32 c) { |
| 44 | if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { |
| 45 | buffer_.Add(static_cast<char>(c)); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 46 | } else { |
| 47 | AddCharSlow(c); |
| 48 | } |
| 49 | } |
| 50 | |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 51 | void StartLiteral() { |
| 52 | buffer_.StartSequence(); |
| 53 | } |
| 54 | |
| 55 | Vector<const char> EndLiteral() { |
| 56 | buffer_.Add(kEndMarker); |
| 57 | Vector<char> sequence = buffer_.EndSequence(); |
| 58 | return Vector<const char>(sequence.start(), sequence.length()); |
| 59 | } |
| 60 | |
| 61 | void DropLiteral() { |
| 62 | buffer_.DropSequence(); |
| 63 | } |
| 64 | |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 65 | void Reset() { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 66 | buffer_.Reset(); |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 67 | } |
| 68 | |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 69 | // The end marker added after a parsed literal. |
| 70 | // Using zero allows the usage of strlen and similar functions on |
| 71 | // identifiers and numbers (but not strings, since they may contain zero |
| 72 | // bytes). |
| 73 | // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside |
| 74 | // an utf-8 string. This requires changes in all places that uses |
| 75 | // str-functions on the literals, but allows a single pointer to represent |
| 76 | // the literal, even if it contains embedded zeros. |
| 77 | static const char kEndMarker = '\x00'; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 78 | private: |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 79 | static const int kInitialCapacity = 256; |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 80 | SequenceCollector<char, 4> buffer_; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 81 | |
| 82 | void AddCharSlow(uc32 c); |
| 83 | }; |
| 84 | |
| 85 | |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 86 | // Interface through which the scanner reads characters from the input source. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 87 | class UTF16Buffer { |
| 88 | public: |
| 89 | UTF16Buffer(); |
| 90 | virtual ~UTF16Buffer() {} |
| 91 | |
| 92 | virtual void PushBack(uc32 ch) = 0; |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 93 | // Returns a value < 0 when the buffer end is reached. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 94 | virtual uc32 Advance() = 0; |
| 95 | virtual void SeekForward(int pos) = 0; |
| 96 | |
| 97 | int pos() const { return pos_; } |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 98 | |
| 99 | protected: |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 100 | int pos_; // Current position in the buffer. |
| 101 | int end_; // Position where scanning should stop (EOF). |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 102 | }; |
| 103 | |
| 104 | |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 105 | // UTF16 buffer to read characters from a character stream. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 106 | class CharacterStreamUTF16Buffer: public UTF16Buffer { |
| 107 | public: |
| 108 | CharacterStreamUTF16Buffer(); |
| 109 | virtual ~CharacterStreamUTF16Buffer() {} |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 110 | void Initialize(Handle<String> data, |
| 111 | unibrow::CharacterStream* stream, |
| 112 | int start_position, |
| 113 | int end_position); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 114 | virtual void PushBack(uc32 ch); |
| 115 | virtual uc32 Advance(); |
| 116 | virtual void SeekForward(int pos); |
| 117 | |
| 118 | private: |
| 119 | List<uc32> pushback_buffer_; |
| 120 | uc32 last_; |
| 121 | unibrow::CharacterStream* stream_; |
| 122 | |
| 123 | List<uc32>* pushback_buffer() { return &pushback_buffer_; } |
| 124 | }; |
| 125 | |
| 126 | |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 127 | // UTF16 buffer to read characters from an external string. |
| 128 | template <typename StringType, typename CharType> |
| 129 | class ExternalStringUTF16Buffer: public UTF16Buffer { |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 130 | public: |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 131 | ExternalStringUTF16Buffer(); |
| 132 | virtual ~ExternalStringUTF16Buffer() {} |
| 133 | void Initialize(Handle<StringType> data, |
| 134 | int start_position, |
| 135 | int end_position); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 136 | virtual void PushBack(uc32 ch); |
| 137 | virtual uc32 Advance(); |
| 138 | virtual void SeekForward(int pos); |
| 139 | |
| 140 | private: |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 141 | const CharType* raw_data_; // Pointer to the actual array of characters. |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 142 | }; |
| 143 | |
| 144 | |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 145 | class KeywordMatcher { |
| 146 | // Incrementally recognize keywords. |
| 147 | // |
| 148 | // Recognized keywords: |
| 149 | // break case catch const* continue debugger* default delete do else |
| 150 | // finally false for function if in instanceof native* new null |
| 151 | // return switch this throw true try typeof var void while with |
| 152 | // |
| 153 | // *: Actually "future reserved keywords". These are the only ones we |
| 154 | // recognized, the remaining are allowed as identifiers. |
| 155 | public: |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 156 | KeywordMatcher() |
| 157 | : state_(INITIAL), |
| 158 | token_(Token::IDENTIFIER), |
| 159 | keyword_(NULL), |
| 160 | counter_(0), |
| 161 | keyword_token_(Token::ILLEGAL) {} |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 162 | |
| 163 | Token::Value token() { return token_; } |
| 164 | |
| 165 | inline void AddChar(uc32 input) { |
| 166 | if (state_ != UNMATCHABLE) { |
| 167 | Step(input); |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | void Fail() { |
| 172 | token_ = Token::IDENTIFIER; |
| 173 | state_ = UNMATCHABLE; |
| 174 | } |
| 175 | |
| 176 | private: |
| 177 | enum State { |
| 178 | UNMATCHABLE, |
| 179 | INITIAL, |
| 180 | KEYWORD_PREFIX, |
| 181 | KEYWORD_MATCHED, |
| 182 | C, |
| 183 | CA, |
| 184 | CO, |
| 185 | CON, |
| 186 | D, |
| 187 | DE, |
| 188 | F, |
| 189 | I, |
| 190 | IN, |
| 191 | N, |
| 192 | T, |
| 193 | TH, |
| 194 | TR, |
| 195 | V, |
| 196 | W |
| 197 | }; |
| 198 | |
| 199 | struct FirstState { |
| 200 | const char* keyword; |
| 201 | State state; |
| 202 | Token::Value token; |
| 203 | }; |
| 204 | |
| 205 | // Range of possible first characters of a keyword. |
| 206 | static const unsigned int kFirstCharRangeMin = 'b'; |
| 207 | static const unsigned int kFirstCharRangeMax = 'w'; |
| 208 | static const unsigned int kFirstCharRangeLength = |
| 209 | kFirstCharRangeMax - kFirstCharRangeMin + 1; |
| 210 | // State map for first keyword character range. |
| 211 | static FirstState first_states_[kFirstCharRangeLength]; |
| 212 | |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 213 | // If input equals keyword's character at position, continue matching keyword |
| 214 | // from that position. |
| 215 | inline bool MatchKeywordStart(uc32 input, |
| 216 | const char* keyword, |
| 217 | int position, |
| 218 | Token::Value token_if_match) { |
| 219 | if (input == keyword[position]) { |
| 220 | state_ = KEYWORD_PREFIX; |
| 221 | this->keyword_ = keyword; |
| 222 | this->counter_ = position + 1; |
| 223 | this->keyword_token_ = token_if_match; |
| 224 | return true; |
| 225 | } |
| 226 | return false; |
| 227 | } |
| 228 | |
| 229 | // If input equals match character, transition to new state and return true. |
| 230 | inline bool MatchState(uc32 input, char match, State new_state) { |
| 231 | if (input == match) { |
| 232 | state_ = new_state; |
| 233 | return true; |
| 234 | } |
| 235 | return false; |
| 236 | } |
| 237 | |
| 238 | inline bool MatchKeyword(uc32 input, |
| 239 | char match, |
| 240 | State new_state, |
| 241 | Token::Value keyword_token) { |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 242 | if (input != match) { |
| 243 | return false; |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 244 | } |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 245 | state_ = new_state; |
| 246 | token_ = keyword_token; |
| 247 | return true; |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 248 | } |
| 249 | |
| 250 | void Step(uc32 input); |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 251 | |
| 252 | // Current state. |
| 253 | State state_; |
| 254 | // Token for currently added characters. |
| 255 | Token::Value token_; |
| 256 | |
| 257 | // Matching a specific keyword string (there is only one possible valid |
| 258 | // keyword with the current prefix). |
| 259 | const char* keyword_; |
| 260 | int counter_; |
| 261 | Token::Value keyword_token_; |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 262 | }; |
| 263 | |
| 264 | |
// Mode the Scanner is constructed with.
enum ParserMode {
  PARSE,
  PREPARSE
};

// Source language handed to Scanner::Initialize(); it selects between the
// JavaScript and the JSON scanning routines.
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
| 267 | |
| 268 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 269 | class Scanner { |
| 270 | public: |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 271 | typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; |
| 272 | |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 273 | class LiteralScope { |
| 274 | public: |
| 275 | explicit LiteralScope(Scanner* self); |
| 276 | ~LiteralScope(); |
| 277 | void Complete(); |
| 278 | |
| 279 | private: |
| 280 | Scanner* scanner_; |
| 281 | bool complete_; |
| 282 | }; |
| 283 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 284 | // Construction |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 285 | explicit Scanner(ParserMode parse_mode); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 286 | |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 287 | // Initialize the Scanner to scan source. |
| 288 | void Initialize(Handle<String> source, |
| 289 | ParserLanguage language); |
| 290 | void Initialize(Handle<String> source, |
| 291 | unibrow::CharacterStream* stream, |
| 292 | ParserLanguage language); |
| 293 | void Initialize(Handle<String> source, |
| 294 | int start_position, int end_position, |
| 295 | ParserLanguage language); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 296 | |
| 297 | // Returns the next token. |
| 298 | Token::Value Next(); |
| 299 | |
| 300 | // One token look-ahead (past the token returned by Next()). |
| 301 | Token::Value peek() const { return next_.token; } |
| 302 | |
| 303 | // Returns true if there was a line terminator before the peek'ed token. |
| 304 | bool has_line_terminator_before_next() const { |
| 305 | return has_line_terminator_before_next_; |
| 306 | } |
| 307 | |
| 308 | struct Location { |
| 309 | Location(int b, int e) : beg_pos(b), end_pos(e) { } |
| 310 | Location() : beg_pos(0), end_pos(0) { } |
| 311 | int beg_pos; |
| 312 | int end_pos; |
| 313 | }; |
| 314 | |
| 315 | // Returns the location information for the current token |
| 316 | // (the token returned by Next()). |
| 317 | Location location() const { return current_.location; } |
| 318 | Location peek_location() const { return next_.location; } |
| 319 | |
| 320 | // Returns the literal string, if any, for the current token (the |
| 321 | // token returned by Next()). The string is 0-terminated and in |
| 322 | // UTF-8 format; they may contain 0-characters. Literal strings are |
| 323 | // collected for identifiers, strings, and numbers. |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 324 | // These functions only give the correct result if the literal |
| 325 | // was scanned between calls to StartLiteral() and TerminateLiteral(). |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 326 | const char* literal_string() const { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 327 | return current_.literal_chars.start(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 328 | } |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 329 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 330 | int literal_length() const { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 331 | // Excluding terminal '\x00' added by TerminateLiteral(). |
| 332 | return current_.literal_chars.length() - 1; |
| 333 | } |
| 334 | |
| 335 | Vector<const char> literal() const { |
| 336 | return Vector<const char>(literal_string(), literal_length()); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 337 | } |
| 338 | |
| 339 | // Returns the literal string for the next token (the token that |
| 340 | // would be returned if Next() were called). |
| 341 | const char* next_literal_string() const { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 342 | return next_.literal_chars.start(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 343 | } |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 344 | |
| 345 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 346 | // Returns the length of the next token (that would be returned if |
| 347 | // Next() were called). |
| 348 | int next_literal_length() const { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 349 | // Excluding terminal '\x00' added by TerminateLiteral(). |
| 350 | return next_.literal_chars.length() - 1; |
Steve Block | d0582a6 | 2009-12-15 09:54:21 +0000 | [diff] [blame] | 351 | } |
| 352 | |
| 353 | Vector<const char> next_literal() const { |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 354 | return Vector<const char>(next_literal_string(), next_literal_length()); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 355 | } |
| 356 | |
| 357 | // Scans the input as a regular expression pattern, previous |
| 358 | // character(s) must be /(=). Returns true if a pattern is scanned. |
| 359 | bool ScanRegExpPattern(bool seen_equal); |
| 360 | // Returns true if regexp flags are scanned (always since flags can |
| 361 | // be empty). |
| 362 | bool ScanRegExpFlags(); |
| 363 | |
| 364 | // Seek forward to the given position. This operation does not |
| 365 | // work in general, for instance when there are pushed back |
| 366 | // characters, but works for seeking forward until simple delimiter |
| 367 | // tokens, which is what it is used for. |
| 368 | void SeekForward(int pos); |
| 369 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 370 | bool stack_overflow() { return stack_overflow_; } |
| 371 | |
| 372 | static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } |
| 373 | |
| 374 | // Tells whether the buffer contains an identifier (no escapes). |
| 375 | // Used for checking if a property name is an identifier. |
| 376 | static bool IsIdentifier(unibrow::CharacterStream* buffer); |
| 377 | |
| 378 | static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; |
| 379 | static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; |
| 380 | static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; |
| 381 | static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; |
| 382 | |
| 383 | static const int kCharacterLookaheadBufferSize = 1; |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 384 | static const int kNoEndPosition = 1; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 385 | |
| 386 | private: |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 387 | // The current and look-ahead token. |
| 388 | struct TokenDesc { |
| 389 | Token::Value token; |
| 390 | Location location; |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 391 | Vector<const char> literal_chars; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 392 | }; |
| 393 | |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 394 | void Init(Handle<String> source, |
| 395 | unibrow::CharacterStream* stream, |
| 396 | int start_position, int end_position, |
| 397 | ParserLanguage language); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 398 | |
| 399 | // Literal buffer support |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 400 | inline void StartLiteral(); |
| 401 | inline void AddChar(uc32 ch); |
| 402 | inline void AddCharAdvance(); |
| 403 | inline void TerminateLiteral(); |
| 404 | // Stops scanning of a literal, e.g., due to an encountered error. |
| 405 | inline void DropLiteral(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 406 | |
| 407 | // Low-level scanning support. |
| 408 | void Advance() { c0_ = source_->Advance(); } |
| 409 | void PushBack(uc32 ch) { |
| 410 | source_->PushBack(ch); |
| 411 | c0_ = ch; |
| 412 | } |
| 413 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 414 | bool SkipWhiteSpace() { |
| 415 | if (is_parsing_json_) { |
| 416 | return SkipJsonWhiteSpace(); |
| 417 | } else { |
| 418 | return SkipJavaScriptWhiteSpace(); |
| 419 | } |
| 420 | } |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 421 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 422 | bool SkipJavaScriptWhiteSpace(); |
| 423 | bool SkipJsonWhiteSpace(); |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 424 | Token::Value SkipSingleLineComment(); |
| 425 | Token::Value SkipMultiLineComment(); |
| 426 | |
| 427 | inline Token::Value Select(Token::Value tok); |
| 428 | inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); |
| 429 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 430 | inline void Scan() { |
| 431 | if (is_parsing_json_) { |
| 432 | ScanJson(); |
| 433 | } else { |
| 434 | ScanJavaScript(); |
| 435 | } |
| 436 | } |
| 437 | |
| 438 | // Scans a single JavaScript token. |
| 439 | void ScanJavaScript(); |
| 440 | |
| 441 | // Scan a single JSON token. The JSON lexical grammar is specified in the |
| 442 | // ECMAScript 5 standard, section 15.12.1.1. |
| 443 | // Recognizes all of the single-character tokens directly, or calls a function |
| 444 | // to scan a number, string or identifier literal. |
| 445 | // The only allowed whitespace characters between tokens are tab, |
| 446 | // carrige-return, newline and space. |
| 447 | void ScanJson(); |
| 448 | |
| 449 | // A JSON number (production JSONNumber) is a subset of the valid JavaScript |
| 450 | // decimal number literals. |
| 451 | // It includes an optional minus sign, must have at least one |
| 452 | // digit before and after a decimal point, may not have prefixed zeros (unless |
| 453 | // the integer part is zero), and may include an exponent part (e.g., "e-10"). |
| 454 | // Hexadecimal and octal numbers are not allowed. |
| 455 | Token::Value ScanJsonNumber(); |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 456 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 457 | // A JSON string (production JSONString) is subset of valid JavaScript string |
| 458 | // literals. The string must only be double-quoted (not single-quoted), and |
| 459 | // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and |
| 460 | // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. |
| 461 | Token::Value ScanJsonString(); |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 462 | |
Leon Clarke | 4515c47 | 2010-02-03 11:58:03 +0000 | [diff] [blame] | 463 | // Used to recognizes one of the literals "true", "false", or "null". These |
| 464 | // are the only valid JSON identifiers (productions JSONBooleanLiteral, |
| 465 | // JSONNullLiteral). |
| 466 | Token::Value ScanJsonIdentifier(const char* text, Token::Value token); |
| 467 | |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 468 | void ScanDecimalDigits(); |
| 469 | Token::Value ScanNumber(bool seen_period); |
| 470 | Token::Value ScanIdentifier(); |
| 471 | uc32 ScanHexEscape(uc32 c, int length); |
| 472 | uc32 ScanOctalEscape(uc32 c, int length); |
| 473 | void ScanEscape(); |
| 474 | Token::Value ScanString(); |
| 475 | |
| 476 | // Scans a possible HTML comment -- begins with '<!'. |
| 477 | Token::Value ScanHtmlComment(); |
| 478 | |
| 479 | // Return the current source position. |
| 480 | int source_pos() { |
Steve Block | 6ded16b | 2010-05-10 14:33:55 +0100 | [diff] [blame] | 481 | return source_->pos() - kCharacterLookaheadBufferSize; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 482 | } |
| 483 | |
| 484 | // Decodes a unicode escape-sequence which is part of an identifier. |
| 485 | // If the escape sequence cannot be decoded the result is kBadRune. |
| 486 | uc32 ScanIdentifierUnicodeEscape(); |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 487 | |
| 488 | TokenDesc current_; // desc for current token (as returned by Next()) |
| 489 | TokenDesc next_; // desc for next token (one token look-ahead) |
| 490 | bool has_line_terminator_before_next_; |
| 491 | bool is_pre_parsing_; |
| 492 | bool is_parsing_json_; |
| 493 | |
| 494 | // Different UTF16 buffers used to pull characters from. Based on input one of |
| 495 | // these will be initialized as the actual data source. |
| 496 | CharacterStreamUTF16Buffer char_stream_buffer_; |
| 497 | ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> |
| 498 | two_byte_string_buffer_; |
| 499 | ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; |
| 500 | |
| 501 | // Source. Will point to one of the buffers declared above. |
| 502 | UTF16Buffer* source_; |
| 503 | |
| 504 | // Used to convert the source string into a character stream when a stream |
| 505 | // is not passed to the scanner. |
| 506 | SafeStringInputBuffer safe_string_input_buffer_; |
| 507 | |
| 508 | // Buffer to hold literal values (identifiers, strings, numbers) |
Kristian Monsen | 80d68ea | 2010-09-08 11:05:35 +0100 | [diff] [blame^] | 509 | // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. |
| 510 | UTF8Buffer literal_buffer_; |
Kristian Monsen | 9dcf7e2 | 2010-06-28 14:14:28 +0100 | [diff] [blame] | 511 | |
| 512 | bool stack_overflow_; |
| 513 | static StaticResource<Utf8Decoder> utf8_decoder_; |
| 514 | |
| 515 | // One Unicode character look-ahead; c0_ < 0 at the end of the input. |
| 516 | uc32 c0_; |
Steve Block | a7e24c1 | 2009-10-30 11:49:00 +0000 | [diff] [blame] | 517 | }; |
| 518 | |
| 519 | } } // namespace v8::internal |
| 520 | |
| 521 | #endif // V8_SCANNER_H_ |