// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29
30#ifndef V8_SCANNER_BASE_H_
31#define V8_SCANNER_BASE_H_
32
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000033#include "globals.h"
34#include "checks.h"
35#include "allocation.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000036#include "token.h"
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000037#include "unicode-inl.h"
38#include "char-predicates.h"
39#include "utils.h"
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000040#include "list-inl.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000041
42namespace v8 {
43namespace internal {
44
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000045// Returns the value (0 .. 15) of a hexadecimal character c.
46// If c is not a legal hexadecimal character, returns a value < 0.
47inline int HexValue(uc32 c) {
48 c -= '0';
49 if (static_cast<unsigned>(c) <= 9) return c;
50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
51 if (static_cast<unsigned>(c) <= 5) return c + 10;
52 return -1;
53}
54
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000055
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000056// ---------------------------------------------------------------------
57// Buffered stream of characters, using an internal UC16 buffer.
58
59class UC16CharacterStream {
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000060 public:
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000061 UC16CharacterStream() : pos_(0) { }
62 virtual ~UC16CharacterStream() { }
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000063
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000064 // Returns and advances past the next UC16 character in the input
65 // stream. If there are no more characters, it returns a negative
66 // value.
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +000067 inline uc32 Advance() {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000068 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
69 pos_++;
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +000070 return static_cast<uc32>(*(buffer_cursor_++));
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000071 }
72 // Note: currently the following increment is necessary to avoid a
73 // parser problem! The scanner treats the final kEndOfInput as
74 // a character with a position, and does math relative to that
75 // position.
76 pos_++;
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000077
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000078 return kEndOfInput;
79 }
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000080
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000081 // Return the current position in the character stream.
82 // Starts at zero.
83 inline unsigned pos() const { return pos_; }
84
85 // Skips forward past the next character_count UC16 characters
86 // in the input, or until the end of input if that comes sooner.
87 // Returns the number of characters actually skipped. If less
88 // than character_count,
89 inline unsigned SeekForward(unsigned character_count) {
90 unsigned buffered_chars =
91 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
92 if (character_count <= buffered_chars) {
93 buffer_cursor_ += character_count;
94 pos_ += character_count;
95 return character_count;
96 }
97 return SlowSeekForward(character_count);
98 }
99
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +0000100 // Pushes back the most recently read UC16 character (or negative
101 // value if at end of input), i.e., the value returned by the most recent
102 // call to Advance.
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000103 // Must not be used right after calling SeekForward.
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +0000104 virtual void PushBack(int32_t character) = 0;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000105
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000106 protected:
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +0000107 static const uc32 kEndOfInput = -1;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000108
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000109 // Ensures that the buffer_cursor_ points to the character at
110 // position pos_ of the input, if possible. If the position
111 // is at or after the end of the input, return false. If there
112 // are more characters available, return true.
113 virtual bool ReadBlock() = 0;
114 virtual unsigned SlowSeekForward(unsigned character_count) = 0;
115
116 const uc16* buffer_cursor_;
117 const uc16* buffer_end_;
118 unsigned pos_;
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000119};
120
121
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000122// ---------------------------------------------------------------------
123// Constants used by scanners.
124
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000125class ScannerConstants : AllStatic {
126 public:
127 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
128
129 static StaticResource<Utf8Decoder>* utf8_decoder() {
130 return &utf8_decoder_;
131 }
132
133 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
134 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
135 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
136 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
137
138 static bool IsIdentifier(unibrow::CharacterStream* buffer);
139
140 private:
141 static StaticResource<Utf8Decoder> utf8_decoder_;
142};
143
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000144// ----------------------------------------------------------------------------
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000145// LiteralBuffer - Collector of chars of literals.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000146
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000147class LiteralBuffer {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000148 public:
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000149 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000150
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000151 ~LiteralBuffer() {
152 if (backing_store_.length() > 0) {
153 backing_store_.Dispose();
154 }
155 }
156
157 inline void AddChar(uc16 character) {
158 if (position_ >= backing_store_.length()) ExpandBuffer();
159 if (is_ascii_) {
160 if (character < kMaxAsciiCharCodeU) {
161 backing_store_[position_] = static_cast<byte>(character);
162 position_ += kASCIISize;
163 return;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000164 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000165 ConvertToUC16();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000166 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000167 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
168 position_ += kUC16Size;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000169 }
170
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000171 bool is_ascii() { return is_ascii_; }
172
173 Vector<const uc16> uc16_literal() {
174 ASSERT(!is_ascii_);
175 ASSERT((position_ & 0x1) == 0);
176 return Vector<const uc16>(
177 reinterpret_cast<const uc16*>(backing_store_.start()),
178 position_ >> 1);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000179 }
180
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000181 Vector<const char> ascii_literal() {
182 ASSERT(is_ascii_);
183 return Vector<const char>(
184 reinterpret_cast<const char*>(backing_store_.start()),
185 position_);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000186 }
187
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000188 int length() {
189 return is_ascii_ ? position_ : (position_ >> 1);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000190 }
191
192 void Reset() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000193 position_ = 0;
194 is_ascii_ = true;
195 }
196 private:
197 static const int kInitialCapacity = 16;
198 static const int kGrowthFactory = 4;
199 static const int kMinConversionSlack = 256;
200 static const int kMaxGrowth = 1 * MB;
201 inline int NewCapacity(int min_capacity) {
202 int capacity = Max(min_capacity, backing_store_.length());
203 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
204 return new_capacity;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000205 }
206
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000207 void ExpandBuffer() {
208 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
209 memcpy(new_store.start(), backing_store_.start(), position_);
210 backing_store_.Dispose();
211 backing_store_ = new_store;
212 }
213
214 void ConvertToUC16() {
215 ASSERT(is_ascii_);
216 Vector<byte> new_store;
217 int new_content_size = position_ * kUC16Size;
lrn@chromium.org5d00b602011-01-05 09:51:43 +0000218 if (new_content_size >= backing_store_.length()) {
219 // Ensure room for all currently read characters as UC16 as well
220 // as the character about to be stored.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000221 new_store = Vector<byte>::New(NewCapacity(new_content_size));
222 } else {
223 new_store = backing_store_;
224 }
225 char* src = reinterpret_cast<char*>(backing_store_.start());
226 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
227 for (int i = position_ - 1; i >= 0; i--) {
228 dst[i] = src[i];
229 }
230 if (new_store.start() != backing_store_.start()) {
231 backing_store_.Dispose();
232 backing_store_ = new_store;
233 }
234 position_ = new_content_size;
235 is_ascii_ = false;
236 }
237
238 bool is_ascii_;
239 int position_;
240 Vector<byte> backing_store_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000241};
242
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000243
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000244// ----------------------------------------------------------------------------
245// Scanner base-class.
246
247// Generic functionality used by both JSON and JavaScript scanners.
248class Scanner {
249 public:
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000250 // -1 is outside of the range of any real source code.
251 static const int kNoOctalLocation = -1;
252
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000253 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
254
255 class LiteralScope {
256 public:
257 explicit LiteralScope(Scanner* self);
258 ~LiteralScope();
259 void Complete();
260
261 private:
262 Scanner* scanner_;
263 bool complete_;
264 };
265
266 Scanner();
267
268 // Returns the current token again.
269 Token::Value current_token() { return current_.token; }
270
271 // One token look-ahead (past the token returned by Next()).
272 Token::Value peek() const { return next_.token; }
273
274 struct Location {
275 Location(int b, int e) : beg_pos(b), end_pos(e) { }
276 Location() : beg_pos(0), end_pos(0) { }
ager@chromium.org378b34e2011-01-28 08:04:38 +0000277
278 bool IsValid() const {
279 return beg_pos >= 0 && end_pos >= beg_pos;
280 }
281
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000282 int beg_pos;
283 int end_pos;
284 };
285
ager@chromium.org378b34e2011-01-28 08:04:38 +0000286 static Location NoLocation() {
287 return Location(-1, -1);
288 }
289
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000290 // Returns the location information for the current token
291 // (the token returned by Next()).
292 Location location() const { return current_.location; }
293 Location peek_location() const { return next_.location; }
294
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000295 // Returns the location of the last seen octal literal
296 int octal_position() const { return octal_pos_; }
297 void clear_octal_position() { octal_pos_ = -1; }
298
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000299 // Returns the literal string, if any, for the current token (the
300 // token returned by Next()). The string is 0-terminated and in
301 // UTF-8 format; they may contain 0-characters. Literal strings are
302 // collected for identifiers, strings, and numbers.
303 // These functions only give the correct result if the literal
304 // was scanned between calls to StartLiteral() and TerminateLiteral().
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000305 bool is_literal_ascii() {
306 ASSERT_NOT_NULL(current_.literal_chars);
307 return current_.literal_chars->is_ascii();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000308 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000309 Vector<const char> literal_ascii_string() {
310 ASSERT_NOT_NULL(current_.literal_chars);
311 return current_.literal_chars->ascii_literal();
312 }
313 Vector<const uc16> literal_uc16_string() {
314 ASSERT_NOT_NULL(current_.literal_chars);
315 return current_.literal_chars->uc16_literal();
316 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000317 int literal_length() const {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000318 ASSERT_NOT_NULL(current_.literal_chars);
319 return current_.literal_chars->length();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000320 }
321
322 // Returns the literal string for the next token (the token that
323 // would be returned if Next() were called).
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000324 bool is_next_literal_ascii() {
325 ASSERT_NOT_NULL(next_.literal_chars);
326 return next_.literal_chars->is_ascii();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000327 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000328 Vector<const char> next_literal_ascii_string() {
329 ASSERT_NOT_NULL(next_.literal_chars);
330 return next_.literal_chars->ascii_literal();
331 }
332 Vector<const uc16> next_literal_uc16_string() {
333 ASSERT_NOT_NULL(next_.literal_chars);
334 return next_.literal_chars->uc16_literal();
335 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000336 int next_literal_length() const {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000337 ASSERT_NOT_NULL(next_.literal_chars);
338 return next_.literal_chars->length();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000339 }
340
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000341 static const int kCharacterLookaheadBufferSize = 1;
342
343 protected:
344 // The current and look-ahead token.
345 struct TokenDesc {
346 Token::Value token;
347 Location location;
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000348 LiteralBuffer* literal_chars;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000349 };
350
351 // Call this after setting source_ to the input.
352 void Init() {
353 // Set c0_ (one character ahead)
354 ASSERT(kCharacterLookaheadBufferSize == 1);
355 Advance();
356 // Initialize current_ to not refer to a literal.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000357 current_.literal_chars = NULL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000358 }
359
360 // Literal buffer support
361 inline void StartLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000362 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
363 &literal_buffer2_ : &literal_buffer1_;
364 free_buffer->Reset();
365 next_.literal_chars = free_buffer;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000366 }
367
368 inline void AddLiteralChar(uc32 c) {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000369 ASSERT_NOT_NULL(next_.literal_chars);
370 next_.literal_chars->AddChar(c);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000371 }
372
373 // Complete scanning of a literal.
374 inline void TerminateLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000375 // Does nothing in the current implementation.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000376 }
377
378 // Stops scanning of a literal and drop the collected characters,
379 // e.g., due to an encountered error.
380 inline void DropLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000381 next_.literal_chars = NULL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000382 }
383
384 inline void AddLiteralCharAdvance() {
385 AddLiteralChar(c0_);
386 Advance();
387 }
388
389 // Low-level scanning support.
390 void Advance() { c0_ = source_->Advance(); }
391 void PushBack(uc32 ch) {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000392 source_->PushBack(c0_);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000393 c0_ = ch;
394 }
395
396 inline Token::Value Select(Token::Value tok) {
397 Advance();
398 return tok;
399 }
400
401 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
402 Advance();
403 if (c0_ == next) {
404 Advance();
405 return then;
406 } else {
407 return else_;
408 }
409 }
410
411 uc32 ScanHexEscape(uc32 c, int length);
kmillikin@chromium.org31b12772011-02-02 16:08:26 +0000412
413 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000414 uc32 ScanOctalEscape(uc32 c, int length);
415
416 // Return the current source position.
417 int source_pos() {
418 return source_->pos() - kCharacterLookaheadBufferSize;
419 }
420
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000421 // Buffers collecting literal strings, numbers, etc.
422 LiteralBuffer literal_buffer1_;
423 LiteralBuffer literal_buffer2_;
424
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000425 TokenDesc current_; // desc for current token (as returned by Next())
426 TokenDesc next_; // desc for next token (one token look-ahead)
427
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000428 // Input stream. Must be initialized to an UC16CharacterStream.
429 UC16CharacterStream* source_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000430
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000431 // Start position of the octal literal last scanned.
432 int octal_pos_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000433
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000434 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
435 uc32 c0_;
436};
437
438// ----------------------------------------------------------------------------
439// JavaScriptScanner - base logic for JavaScript scanning.
440
441class JavaScriptScanner : public Scanner {
442 public:
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000443 // A LiteralScope that disables recording of some types of JavaScript
444 // literals. If the scanner is configured to not record the specific
445 // type of literal, the scope will not call StartLiteral.
446 class LiteralScope {
447 public:
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000448 explicit LiteralScope(JavaScriptScanner* self)
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000449 : scanner_(self), complete_(false) {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000450 scanner_->StartLiteral();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000451 }
452 ~LiteralScope() {
453 if (!complete_) scanner_->DropLiteral();
454 }
455 void Complete() {
456 scanner_->TerminateLiteral();
457 complete_ = true;
458 }
459
460 private:
461 JavaScriptScanner* scanner_;
462 bool complete_;
463 };
464
465 JavaScriptScanner();
466
467 // Returns the next token.
468 Token::Value Next();
469
470 // Returns true if there was a line terminator before the peek'ed token.
471 bool has_line_terminator_before_next() const {
472 return has_line_terminator_before_next_;
473 }
474
475 // Scans the input as a regular expression pattern, previous
476 // character(s) must be /(=). Returns true if a pattern is scanned.
477 bool ScanRegExpPattern(bool seen_equal);
478 // Returns true if regexp flags are scanned (always since flags can
479 // be empty).
480 bool ScanRegExpFlags();
481
482 // Tells whether the buffer contains an identifier (no escapes).
483 // Used for checking if a property name is an identifier.
484 static bool IsIdentifier(unibrow::CharacterStream* buffer);
485
486 // Seek forward to the given position. This operation does not
487 // work in general, for instance when there are pushed back
488 // characters, but works for seeking forward until simple delimiter
489 // tokens, which is what it is used for.
490 void SeekForward(int pos);
491
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000492 protected:
493 bool SkipWhiteSpace();
494 Token::Value SkipSingleLineComment();
495 Token::Value SkipMultiLineComment();
496
497 // Scans a single JavaScript token.
498 void Scan();
499
500 void ScanDecimalDigits();
501 Token::Value ScanNumber(bool seen_period);
502 Token::Value ScanIdentifierOrKeyword();
503 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
504
505 void ScanEscape();
506 Token::Value ScanString();
507
508 // Scans a possible HTML comment -- begins with '<!'.
509 Token::Value ScanHtmlComment();
510
511 // Decodes a unicode escape-sequence which is part of an identifier.
512 // If the escape sequence cannot be decoded the result is kBadChar.
513 uc32 ScanIdentifierUnicodeEscape();
514
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000515 bool has_line_terminator_before_next_;
516};
517
518
519// ----------------------------------------------------------------------------
520// Keyword matching state machine.
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000521
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000522class KeywordMatcher {
523// Incrementally recognize keywords.
524//
525// Recognized keywords:
526// break case catch const* continue debugger* default delete do else
527// finally false for function if in instanceof native* new null
528// return switch this throw true try typeof var void while with
529//
530// *: Actually "future reserved keywords". These are the only ones we
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000531// recognize, the remaining are allowed as identifiers.
532// In ES5 strict mode, we should disallow all reserved keywords.
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000533 public:
534 KeywordMatcher()
535 : state_(INITIAL),
536 token_(Token::IDENTIFIER),
537 keyword_(NULL),
538 counter_(0),
539 keyword_token_(Token::ILLEGAL) {}
540
541 Token::Value token() { return token_; }
542
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000543 inline bool AddChar(unibrow::uchar input) {
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000544 if (state_ != UNMATCHABLE) {
545 Step(input);
546 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000547 return state_ != UNMATCHABLE;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000548 }
549
550 void Fail() {
551 token_ = Token::IDENTIFIER;
552 state_ = UNMATCHABLE;
553 }
554
555 private:
556 enum State {
557 UNMATCHABLE,
558 INITIAL,
559 KEYWORD_PREFIX,
560 KEYWORD_MATCHED,
561 C,
562 CA,
563 CO,
564 CON,
565 D,
566 DE,
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000567 E,
568 EX,
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000569 F,
570 I,
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000571 IM,
572 IMP,
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000573 IN,
574 N,
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000575 P,
576 PR,
577 S,
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000578 T,
579 TH,
580 TR,
581 V,
582 W
583 };
584
585 struct FirstState {
586 const char* keyword;
587 State state;
588 Token::Value token;
589 };
590
591 // Range of possible first characters of a keyword.
592 static const unsigned int kFirstCharRangeMin = 'b';
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000593 static const unsigned int kFirstCharRangeMax = 'y';
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000594 static const unsigned int kFirstCharRangeLength =
595 kFirstCharRangeMax - kFirstCharRangeMin + 1;
596 // State map for first keyword character range.
597 static FirstState first_states_[kFirstCharRangeLength];
598
599 // If input equals keyword's character at position, continue matching keyword
600 // from that position.
601 inline bool MatchKeywordStart(unibrow::uchar input,
602 const char* keyword,
603 int position,
604 Token::Value token_if_match) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000605 if (input != static_cast<unibrow::uchar>(keyword[position])) {
606 return false;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000607 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000608 state_ = KEYWORD_PREFIX;
609 this->keyword_ = keyword;
610 this->counter_ = position + 1;
611 this->keyword_token_ = token_if_match;
612 return true;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000613 }
614
615 // If input equals match character, transition to new state and return true.
616 inline bool MatchState(unibrow::uchar input, char match, State new_state) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000617 if (input != static_cast<unibrow::uchar>(match)) {
618 return false;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000619 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000620 state_ = new_state;
621 return true;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000622 }
623
624 inline bool MatchKeyword(unibrow::uchar input,
625 char match,
626 State new_state,
627 Token::Value keyword_token) {
628 if (input != static_cast<unibrow::uchar>(match)) {
629 return false;
630 }
631 state_ = new_state;
632 token_ = keyword_token;
633 return true;
634 }
635
636 void Step(unibrow::uchar input);
637
638 // Current state.
639 State state_;
640 // Token for currently added characters.
641 Token::Value token_;
642
643 // Matching a specific keyword string (there is only one possible valid
644 // keyword with the current prefix).
645 const char* keyword_;
646 int counter_;
647 Token::Value keyword_token_;
648};
649
650
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000651} } // namespace v8::internal
652
653#endif // V8_SCANNER_BASE_H_