// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29
30#ifndef V8_SCANNER_BASE_H_
31#define V8_SCANNER_BASE_H_
32
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080033#include "globals.h"
34#include "checks.h"
35#include "allocation.h"
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080036#include "token.h"
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080037#include "unicode-inl.h"
38#include "char-predicates.h"
39#include "utils.h"
40#include "list-inl.h"
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080041
42namespace v8 {
43namespace internal {
44
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080045// Returns the value (0 .. 15) of a hexadecimal character c.
46// If c is not a legal hexadecimal character, returns a value < 0.
47inline int HexValue(uc32 c) {
48 c -= '0';
49 if (static_cast<unsigned>(c) <= 9) return c;
50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
51 if (static_cast<unsigned>(c) <= 5) return c + 10;
52 return -1;
53}
54
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080055
Ben Murdochb0fe1622011-05-05 13:52:32 +010056// ---------------------------------------------------------------------
57// Buffered stream of characters, using an internal UC16 buffer.
58
59class UC16CharacterStream {
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080060 public:
Ben Murdochb0fe1622011-05-05 13:52:32 +010061 UC16CharacterStream() : pos_(0) { }
62 virtual ~UC16CharacterStream() { }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080063
Ben Murdochb0fe1622011-05-05 13:52:32 +010064 // Returns and advances past the next UC16 character in the input
65 // stream. If there are no more characters, it returns a negative
66 // value.
Ben Murdochb8e0da22011-05-16 14:20:40 +010067 inline uc32 Advance() {
Ben Murdochb0fe1622011-05-05 13:52:32 +010068 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
69 pos_++;
Ben Murdochb8e0da22011-05-16 14:20:40 +010070 return static_cast<uc32>(*(buffer_cursor_++));
Ben Murdochb0fe1622011-05-05 13:52:32 +010071 }
72 // Note: currently the following increment is necessary to avoid a
73 // parser problem! The scanner treats the final kEndOfInput as
74 // a character with a position, and does math relative to that
75 // position.
76 pos_++;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080077
Ben Murdochb0fe1622011-05-05 13:52:32 +010078 return kEndOfInput;
79 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -080080
Ben Murdochb0fe1622011-05-05 13:52:32 +010081 // Return the current position in the character stream.
82 // Starts at zero.
83 inline unsigned pos() const { return pos_; }
84
85 // Skips forward past the next character_count UC16 characters
86 // in the input, or until the end of input if that comes sooner.
87 // Returns the number of characters actually skipped. If less
88 // than character_count,
89 inline unsigned SeekForward(unsigned character_count) {
90 unsigned buffered_chars =
91 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
92 if (character_count <= buffered_chars) {
93 buffer_cursor_ += character_count;
94 pos_ += character_count;
95 return character_count;
96 }
97 return SlowSeekForward(character_count);
98 }
99
Ben Murdochb8e0da22011-05-16 14:20:40 +0100100 // Pushes back the most recently read UC16 character (or negative
101 // value if at end of input), i.e., the value returned by the most recent
102 // call to Advance.
Ben Murdochb0fe1622011-05-05 13:52:32 +0100103 // Must not be used right after calling SeekForward.
Ben Murdochb8e0da22011-05-16 14:20:40 +0100104 virtual void PushBack(int32_t character) = 0;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800105
106 protected:
Ben Murdochb8e0da22011-05-16 14:20:40 +0100107 static const uc32 kEndOfInput = -1;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800108
Ben Murdochb0fe1622011-05-05 13:52:32 +0100109 // Ensures that the buffer_cursor_ points to the character at
110 // position pos_ of the input, if possible. If the position
111 // is at or after the end of the input, return false. If there
112 // are more characters available, return true.
113 virtual bool ReadBlock() = 0;
114 virtual unsigned SlowSeekForward(unsigned character_count) = 0;
115
116 const uc16* buffer_cursor_;
117 const uc16* buffer_end_;
118 unsigned pos_;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800119};
120
121
Ben Murdoch8b112d22011-06-08 16:22:53 +0100122class UnicodeCache {
Ben Murdochb0fe1622011-05-05 13:52:32 +0100123// ---------------------------------------------------------------------
Ben Murdoch8b112d22011-06-08 16:22:53 +0100124// Caching predicates used by scanners.
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800125 public:
Ben Murdoch8b112d22011-06-08 16:22:53 +0100126 UnicodeCache() {}
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800127 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
128
Steve Block44f0eee2011-05-26 01:26:41 +0100129 StaticResource<Utf8Decoder>* utf8_decoder() {
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800130 return &utf8_decoder_;
131 }
132
Steve Block44f0eee2011-05-26 01:26:41 +0100133 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
134 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
135 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
136 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800137
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800138 private:
Steve Block44f0eee2011-05-26 01:26:41 +0100139
140 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
141 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
142 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
143 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
144 StaticResource<Utf8Decoder> utf8_decoder_;
145
Ben Murdoch8b112d22011-06-08 16:22:53 +0100146 DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800147};
148
Ben Murdoch8b112d22011-06-08 16:22:53 +0100149
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800150// ----------------------------------------------------------------------------
Steve Block9fac8402011-05-12 15:51:54 +0100151// LiteralBuffer - Collector of chars of literals.
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800152
Steve Block9fac8402011-05-12 15:51:54 +0100153class LiteralBuffer {
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800154 public:
Steve Block9fac8402011-05-12 15:51:54 +0100155 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800156
Steve Block9fac8402011-05-12 15:51:54 +0100157 ~LiteralBuffer() {
158 if (backing_store_.length() > 0) {
159 backing_store_.Dispose();
160 }
161 }
162
163 inline void AddChar(uc16 character) {
164 if (position_ >= backing_store_.length()) ExpandBuffer();
165 if (is_ascii_) {
166 if (character < kMaxAsciiCharCodeU) {
167 backing_store_[position_] = static_cast<byte>(character);
168 position_ += kASCIISize;
169 return;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800170 }
Steve Block9fac8402011-05-12 15:51:54 +0100171 ConvertToUC16();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800172 }
Steve Block9fac8402011-05-12 15:51:54 +0100173 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
174 position_ += kUC16Size;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800175 }
176
Steve Block9fac8402011-05-12 15:51:54 +0100177 bool is_ascii() { return is_ascii_; }
178
179 Vector<const uc16> uc16_literal() {
180 ASSERT(!is_ascii_);
181 ASSERT((position_ & 0x1) == 0);
182 return Vector<const uc16>(
183 reinterpret_cast<const uc16*>(backing_store_.start()),
184 position_ >> 1);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800185 }
186
Steve Block9fac8402011-05-12 15:51:54 +0100187 Vector<const char> ascii_literal() {
188 ASSERT(is_ascii_);
189 return Vector<const char>(
190 reinterpret_cast<const char*>(backing_store_.start()),
191 position_);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800192 }
193
Steve Block9fac8402011-05-12 15:51:54 +0100194 int length() {
195 return is_ascii_ ? position_ : (position_ >> 1);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800196 }
197
198 void Reset() {
Steve Block9fac8402011-05-12 15:51:54 +0100199 position_ = 0;
200 is_ascii_ = true;
201 }
202 private:
203 static const int kInitialCapacity = 16;
204 static const int kGrowthFactory = 4;
205 static const int kMinConversionSlack = 256;
206 static const int kMaxGrowth = 1 * MB;
207 inline int NewCapacity(int min_capacity) {
208 int capacity = Max(min_capacity, backing_store_.length());
209 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
210 return new_capacity;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800211 }
212
Steve Block9fac8402011-05-12 15:51:54 +0100213 void ExpandBuffer() {
214 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
215 memcpy(new_store.start(), backing_store_.start(), position_);
216 backing_store_.Dispose();
217 backing_store_ = new_store;
218 }
219
220 void ConvertToUC16() {
221 ASSERT(is_ascii_);
222 Vector<byte> new_store;
223 int new_content_size = position_ * kUC16Size;
224 if (new_content_size >= backing_store_.length()) {
225 // Ensure room for all currently read characters as UC16 as well
226 // as the character about to be stored.
227 new_store = Vector<byte>::New(NewCapacity(new_content_size));
228 } else {
229 new_store = backing_store_;
230 }
231 char* src = reinterpret_cast<char*>(backing_store_.start());
232 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
233 for (int i = position_ - 1; i >= 0; i--) {
234 dst[i] = src[i];
235 }
236 if (new_store.start() != backing_store_.start()) {
237 backing_store_.Dispose();
238 backing_store_ = new_store;
239 }
240 position_ = new_content_size;
241 is_ascii_ = false;
242 }
243
244 bool is_ascii_;
245 int position_;
246 Vector<byte> backing_store_;
Steve Block44f0eee2011-05-26 01:26:41 +0100247
248 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800249};
250
Steve Block9fac8402011-05-12 15:51:54 +0100251
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800252// ----------------------------------------------------------------------------
253// Scanner base-class.
254
255// Generic functionality used by both JSON and JavaScript scanners.
256class Scanner {
257 public:
Steve Block1e0659c2011-05-24 12:43:12 +0100258 // -1 is outside of the range of any real source code.
259 static const int kNoOctalLocation = -1;
260
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800261 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
262
263 class LiteralScope {
264 public:
265 explicit LiteralScope(Scanner* self);
266 ~LiteralScope();
267 void Complete();
268
269 private:
270 Scanner* scanner_;
271 bool complete_;
272 };
273
Ben Murdoch8b112d22011-06-08 16:22:53 +0100274 explicit Scanner(UnicodeCache* scanner_contants);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800275
276 // Returns the current token again.
277 Token::Value current_token() { return current_.token; }
278
279 // One token look-ahead (past the token returned by Next()).
280 Token::Value peek() const { return next_.token; }
281
282 struct Location {
283 Location(int b, int e) : beg_pos(b), end_pos(e) { }
284 Location() : beg_pos(0), end_pos(0) { }
Steve Block1e0659c2011-05-24 12:43:12 +0100285
286 bool IsValid() const {
287 return beg_pos >= 0 && end_pos >= beg_pos;
288 }
289
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800290 int beg_pos;
291 int end_pos;
292 };
293
Steve Block1e0659c2011-05-24 12:43:12 +0100294 static Location NoLocation() {
295 return Location(-1, -1);
296 }
297
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800298 // Returns the location information for the current token
299 // (the token returned by Next()).
300 Location location() const { return current_.location; }
301 Location peek_location() const { return next_.location; }
302
Steve Block1e0659c2011-05-24 12:43:12 +0100303 // Returns the location of the last seen octal literal
304 int octal_position() const { return octal_pos_; }
305 void clear_octal_position() { octal_pos_ = -1; }
306
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800307 // Returns the literal string, if any, for the current token (the
308 // token returned by Next()). The string is 0-terminated and in
309 // UTF-8 format; they may contain 0-characters. Literal strings are
310 // collected for identifiers, strings, and numbers.
311 // These functions only give the correct result if the literal
312 // was scanned between calls to StartLiteral() and TerminateLiteral().
Steve Block9fac8402011-05-12 15:51:54 +0100313 bool is_literal_ascii() {
314 ASSERT_NOT_NULL(current_.literal_chars);
315 return current_.literal_chars->is_ascii();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800316 }
Steve Block9fac8402011-05-12 15:51:54 +0100317 Vector<const char> literal_ascii_string() {
318 ASSERT_NOT_NULL(current_.literal_chars);
319 return current_.literal_chars->ascii_literal();
320 }
321 Vector<const uc16> literal_uc16_string() {
322 ASSERT_NOT_NULL(current_.literal_chars);
323 return current_.literal_chars->uc16_literal();
324 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800325 int literal_length() const {
Steve Block9fac8402011-05-12 15:51:54 +0100326 ASSERT_NOT_NULL(current_.literal_chars);
327 return current_.literal_chars->length();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800328 }
329
330 // Returns the literal string for the next token (the token that
331 // would be returned if Next() were called).
Steve Block9fac8402011-05-12 15:51:54 +0100332 bool is_next_literal_ascii() {
333 ASSERT_NOT_NULL(next_.literal_chars);
334 return next_.literal_chars->is_ascii();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800335 }
Steve Block9fac8402011-05-12 15:51:54 +0100336 Vector<const char> next_literal_ascii_string() {
337 ASSERT_NOT_NULL(next_.literal_chars);
338 return next_.literal_chars->ascii_literal();
339 }
340 Vector<const uc16> next_literal_uc16_string() {
341 ASSERT_NOT_NULL(next_.literal_chars);
342 return next_.literal_chars->uc16_literal();
343 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800344 int next_literal_length() const {
Steve Block9fac8402011-05-12 15:51:54 +0100345 ASSERT_NOT_NULL(next_.literal_chars);
346 return next_.literal_chars->length();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800347 }
348
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800349 static const int kCharacterLookaheadBufferSize = 1;
350
351 protected:
352 // The current and look-ahead token.
353 struct TokenDesc {
354 Token::Value token;
355 Location location;
Steve Block9fac8402011-05-12 15:51:54 +0100356 LiteralBuffer* literal_chars;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800357 };
358
359 // Call this after setting source_ to the input.
360 void Init() {
361 // Set c0_ (one character ahead)
362 ASSERT(kCharacterLookaheadBufferSize == 1);
363 Advance();
364 // Initialize current_ to not refer to a literal.
Steve Block9fac8402011-05-12 15:51:54 +0100365 current_.literal_chars = NULL;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800366 }
367
368 // Literal buffer support
369 inline void StartLiteral() {
Steve Block9fac8402011-05-12 15:51:54 +0100370 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
371 &literal_buffer2_ : &literal_buffer1_;
372 free_buffer->Reset();
373 next_.literal_chars = free_buffer;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800374 }
375
376 inline void AddLiteralChar(uc32 c) {
Steve Block9fac8402011-05-12 15:51:54 +0100377 ASSERT_NOT_NULL(next_.literal_chars);
378 next_.literal_chars->AddChar(c);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800379 }
380
381 // Complete scanning of a literal.
382 inline void TerminateLiteral() {
Steve Block9fac8402011-05-12 15:51:54 +0100383 // Does nothing in the current implementation.
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800384 }
385
386 // Stops scanning of a literal and drop the collected characters,
387 // e.g., due to an encountered error.
388 inline void DropLiteral() {
Steve Block9fac8402011-05-12 15:51:54 +0100389 next_.literal_chars = NULL;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800390 }
391
392 inline void AddLiteralCharAdvance() {
393 AddLiteralChar(c0_);
394 Advance();
395 }
396
397 // Low-level scanning support.
398 void Advance() { c0_ = source_->Advance(); }
399 void PushBack(uc32 ch) {
Ben Murdochb0fe1622011-05-05 13:52:32 +0100400 source_->PushBack(c0_);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800401 c0_ = ch;
402 }
403
404 inline Token::Value Select(Token::Value tok) {
405 Advance();
406 return tok;
407 }
408
409 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
410 Advance();
411 if (c0_ == next) {
412 Advance();
413 return then;
414 } else {
415 return else_;
416 }
417 }
418
419 uc32 ScanHexEscape(uc32 c, int length);
Steve Block1e0659c2011-05-24 12:43:12 +0100420
421 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800422 uc32 ScanOctalEscape(uc32 c, int length);
423
424 // Return the current source position.
425 int source_pos() {
426 return source_->pos() - kCharacterLookaheadBufferSize;
427 }
428
Ben Murdoch8b112d22011-06-08 16:22:53 +0100429 UnicodeCache* unicode_cache_;
Steve Block44f0eee2011-05-26 01:26:41 +0100430
Steve Block9fac8402011-05-12 15:51:54 +0100431 // Buffers collecting literal strings, numbers, etc.
432 LiteralBuffer literal_buffer1_;
433 LiteralBuffer literal_buffer2_;
434
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800435 TokenDesc current_; // desc for current token (as returned by Next())
436 TokenDesc next_; // desc for next token (one token look-ahead)
437
Ben Murdochb0fe1622011-05-05 13:52:32 +0100438 // Input stream. Must be initialized to an UC16CharacterStream.
439 UC16CharacterStream* source_;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800440
Steve Block1e0659c2011-05-24 12:43:12 +0100441 // Start position of the octal literal last scanned.
442 int octal_pos_;
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800443
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800444 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
445 uc32 c0_;
446};
447
448// ----------------------------------------------------------------------------
449// JavaScriptScanner - base logic for JavaScript scanning.
450
451class JavaScriptScanner : public Scanner {
452 public:
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800453 // A LiteralScope that disables recording of some types of JavaScript
454 // literals. If the scanner is configured to not record the specific
455 // type of literal, the scope will not call StartLiteral.
456 class LiteralScope {
457 public:
Steve Block9fac8402011-05-12 15:51:54 +0100458 explicit LiteralScope(JavaScriptScanner* self)
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800459 : scanner_(self), complete_(false) {
Steve Block9fac8402011-05-12 15:51:54 +0100460 scanner_->StartLiteral();
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800461 }
462 ~LiteralScope() {
463 if (!complete_) scanner_->DropLiteral();
464 }
465 void Complete() {
466 scanner_->TerminateLiteral();
467 complete_ = true;
468 }
469
470 private:
471 JavaScriptScanner* scanner_;
472 bool complete_;
473 };
474
Ben Murdoch8b112d22011-06-08 16:22:53 +0100475 explicit JavaScriptScanner(UnicodeCache* scanner_contants);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800476
477 // Returns the next token.
478 Token::Value Next();
479
480 // Returns true if there was a line terminator before the peek'ed token.
481 bool has_line_terminator_before_next() const {
482 return has_line_terminator_before_next_;
483 }
484
485 // Scans the input as a regular expression pattern, previous
486 // character(s) must be /(=). Returns true if a pattern is scanned.
487 bool ScanRegExpPattern(bool seen_equal);
488 // Returns true if regexp flags are scanned (always since flags can
489 // be empty).
490 bool ScanRegExpFlags();
491
492 // Tells whether the buffer contains an identifier (no escapes).
493 // Used for checking if a property name is an identifier.
494 static bool IsIdentifier(unibrow::CharacterStream* buffer);
495
496 // Seek forward to the given position. This operation does not
497 // work in general, for instance when there are pushed back
498 // characters, but works for seeking forward until simple delimiter
499 // tokens, which is what it is used for.
500 void SeekForward(int pos);
501
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800502 protected:
503 bool SkipWhiteSpace();
504 Token::Value SkipSingleLineComment();
505 Token::Value SkipMultiLineComment();
506
507 // Scans a single JavaScript token.
508 void Scan();
509
510 void ScanDecimalDigits();
511 Token::Value ScanNumber(bool seen_period);
512 Token::Value ScanIdentifierOrKeyword();
513 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
514
515 void ScanEscape();
516 Token::Value ScanString();
517
518 // Scans a possible HTML comment -- begins with '<!'.
519 Token::Value ScanHtmlComment();
520
521 // Decodes a unicode escape-sequence which is part of an identifier.
522 // If the escape sequence cannot be decoded the result is kBadChar.
523 uc32 ScanIdentifierUnicodeEscape();
524
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800525 bool has_line_terminator_before_next_;
526};
527
528
529// ----------------------------------------------------------------------------
530// Keyword matching state machine.
531
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800532class KeywordMatcher {
533// Incrementally recognize keywords.
534//
535// Recognized keywords:
536// break case catch const* continue debugger* default delete do else
537// finally false for function if in instanceof native* new null
538// return switch this throw true try typeof var void while with
539//
540// *: Actually "future reserved keywords". These are the only ones we
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800541// recognize, the remaining are allowed as identifiers.
542// In ES5 strict mode, we should disallow all reserved keywords.
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800543 public:
544 KeywordMatcher()
545 : state_(INITIAL),
546 token_(Token::IDENTIFIER),
547 keyword_(NULL),
548 counter_(0),
549 keyword_token_(Token::ILLEGAL) {}
550
551 Token::Value token() { return token_; }
552
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800553 inline bool AddChar(unibrow::uchar input) {
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800554 if (state_ != UNMATCHABLE) {
555 Step(input);
556 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800557 return state_ != UNMATCHABLE;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800558 }
559
560 void Fail() {
561 token_ = Token::IDENTIFIER;
562 state_ = UNMATCHABLE;
563 }
564
565 private:
566 enum State {
567 UNMATCHABLE,
568 INITIAL,
569 KEYWORD_PREFIX,
570 KEYWORD_MATCHED,
571 C,
572 CA,
573 CO,
574 CON,
575 D,
576 DE,
Steve Block1e0659c2011-05-24 12:43:12 +0100577 E,
578 EX,
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800579 F,
580 I,
Steve Block1e0659c2011-05-24 12:43:12 +0100581 IM,
582 IMP,
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800583 IN,
584 N,
Steve Block1e0659c2011-05-24 12:43:12 +0100585 P,
586 PR,
587 S,
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800588 T,
589 TH,
590 TR,
591 V,
592 W
593 };
594
595 struct FirstState {
596 const char* keyword;
597 State state;
598 Token::Value token;
599 };
600
601 // Range of possible first characters of a keyword.
602 static const unsigned int kFirstCharRangeMin = 'b';
Steve Block1e0659c2011-05-24 12:43:12 +0100603 static const unsigned int kFirstCharRangeMax = 'y';
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800604 static const unsigned int kFirstCharRangeLength =
605 kFirstCharRangeMax - kFirstCharRangeMin + 1;
606 // State map for first keyword character range.
607 static FirstState first_states_[kFirstCharRangeLength];
608
609 // If input equals keyword's character at position, continue matching keyword
610 // from that position.
611 inline bool MatchKeywordStart(unibrow::uchar input,
612 const char* keyword,
613 int position,
614 Token::Value token_if_match) {
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800615 if (input != static_cast<unibrow::uchar>(keyword[position])) {
616 return false;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800617 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800618 state_ = KEYWORD_PREFIX;
619 this->keyword_ = keyword;
620 this->counter_ = position + 1;
621 this->keyword_token_ = token_if_match;
622 return true;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800623 }
624
625 // If input equals match character, transition to new state and return true.
626 inline bool MatchState(unibrow::uchar input, char match, State new_state) {
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800627 if (input != static_cast<unibrow::uchar>(match)) {
628 return false;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800629 }
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800630 state_ = new_state;
631 return true;
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800632 }
633
634 inline bool MatchKeyword(unibrow::uchar input,
635 char match,
636 State new_state,
637 Token::Value keyword_token) {
638 if (input != static_cast<unibrow::uchar>(match)) {
639 return false;
640 }
641 state_ = new_state;
642 token_ = keyword_token;
643 return true;
644 }
645
646 void Step(unibrow::uchar input);
647
648 // Current state.
649 State state_;
650 // Token for currently added characters.
651 Token::Value token_;
652
653 // Matching a specific keyword string (there is only one possible valid
654 // keyword with the current prefix).
655 const char* keyword_;
656 int counter_;
657 Token::Value keyword_token_;
658};
659
660
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -0800661} } // namespace v8::internal
662
663#endif // V8_SCANNER_BASE_H_