// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.

30#ifndef V8_SCANNER_BASE_H_
31#define V8_SCANNER_BASE_H_
32
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000033#include "allocation.h"
fschneider@chromium.orgfb144a02011-05-04 12:43:48 +000034#include "char-predicates.h"
35#include "checks.h"
36#include "globals.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000037#include "token.h"
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000038#include "unicode-inl.h"
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000039#include "utils.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000040
41namespace v8 {
42namespace internal {
43
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000044// Returns the value (0 .. 15) of a hexadecimal character c.
45// If c is not a legal hexadecimal character, returns a value < 0.
46inline int HexValue(uc32 c) {
47 c -= '0';
48 if (static_cast<unsigned>(c) <= 9) return c;
49 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
50 if (static_cast<unsigned>(c) <= 5) return c + 10;
51 return -1;
52}
53
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000054
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000055// ---------------------------------------------------------------------
56// Buffered stream of characters, using an internal UC16 buffer.
57
58class UC16CharacterStream {
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000059 public:
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000060 UC16CharacterStream() : pos_(0) { }
61 virtual ~UC16CharacterStream() { }
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000062
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000063 // Returns and advances past the next UC16 character in the input
64 // stream. If there are no more characters, it returns a negative
65 // value.
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +000066 inline uc32 Advance() {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000067 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
68 pos_++;
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +000069 return static_cast<uc32>(*(buffer_cursor_++));
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000070 }
71 // Note: currently the following increment is necessary to avoid a
72 // parser problem! The scanner treats the final kEndOfInput as
73 // a character with a position, and does math relative to that
74 // position.
75 pos_++;
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000076
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000077 return kEndOfInput;
78 }
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000079
ager@chromium.org5f0c45f2010-12-17 08:51:21 +000080 // Return the current position in the character stream.
81 // Starts at zero.
82 inline unsigned pos() const { return pos_; }
83
84 // Skips forward past the next character_count UC16 characters
85 // in the input, or until the end of input if that comes sooner.
86 // Returns the number of characters actually skipped. If less
87 // than character_count,
88 inline unsigned SeekForward(unsigned character_count) {
89 unsigned buffered_chars =
90 static_cast<unsigned>(buffer_end_ - buffer_cursor_);
91 if (character_count <= buffered_chars) {
92 buffer_cursor_ += character_count;
93 pos_ += character_count;
94 return character_count;
95 }
96 return SlowSeekForward(character_count);
97 }
98
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +000099 // Pushes back the most recently read UC16 character (or negative
100 // value if at end of input), i.e., the value returned by the most recent
101 // call to Advance.
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000102 // Must not be used right after calling SeekForward.
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +0000103 virtual void PushBack(int32_t character) = 0;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000104
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000105 protected:
sgjesse@chromium.orgc6c57182011-01-17 12:24:25 +0000106 static const uc32 kEndOfInput = -1;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000107
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000108 // Ensures that the buffer_cursor_ points to the character at
109 // position pos_ of the input, if possible. If the position
110 // is at or after the end of the input, return false. If there
111 // are more characters available, return true.
112 virtual bool ReadBlock() = 0;
113 virtual unsigned SlowSeekForward(unsigned character_count) = 0;
114
115 const uc16* buffer_cursor_;
116 const uc16* buffer_end_;
117 unsigned pos_;
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000118};
119
120
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000121class UnicodeCache {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000122// ---------------------------------------------------------------------
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000123// Caching predicates used by scanners.
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000124 public:
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000125 UnicodeCache() {}
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000126 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
127
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000128 StaticResource<Utf8Decoder>* utf8_decoder() {
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000129 return &utf8_decoder_;
130 }
131
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000132 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); }
133 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
134 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
135 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000136
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000137 private:
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000138
139 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
140 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
141 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
142 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
143 StaticResource<Utf8Decoder> utf8_decoder_;
144
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000145 DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +0000146};
147
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000148
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000149// ----------------------------------------------------------------------------
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000150// LiteralBuffer - Collector of chars of literals.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000151
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000152class LiteralBuffer {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000153 public:
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000154 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000155
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000156 ~LiteralBuffer() {
157 if (backing_store_.length() > 0) {
158 backing_store_.Dispose();
159 }
160 }
161
162 inline void AddChar(uc16 character) {
163 if (position_ >= backing_store_.length()) ExpandBuffer();
164 if (is_ascii_) {
165 if (character < kMaxAsciiCharCodeU) {
166 backing_store_[position_] = static_cast<byte>(character);
167 position_ += kASCIISize;
168 return;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000169 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000170 ConvertToUC16();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000171 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000172 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
173 position_ += kUC16Size;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000174 }
175
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000176 bool is_ascii() { return is_ascii_; }
177
178 Vector<const uc16> uc16_literal() {
179 ASSERT(!is_ascii_);
180 ASSERT((position_ & 0x1) == 0);
181 return Vector<const uc16>(
182 reinterpret_cast<const uc16*>(backing_store_.start()),
183 position_ >> 1);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000184 }
185
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000186 Vector<const char> ascii_literal() {
187 ASSERT(is_ascii_);
188 return Vector<const char>(
189 reinterpret_cast<const char*>(backing_store_.start()),
190 position_);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000191 }
192
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000193 int length() {
194 return is_ascii_ ? position_ : (position_ >> 1);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000195 }
196
197 void Reset() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000198 position_ = 0;
199 is_ascii_ = true;
200 }
201 private:
202 static const int kInitialCapacity = 16;
203 static const int kGrowthFactory = 4;
204 static const int kMinConversionSlack = 256;
205 static const int kMaxGrowth = 1 * MB;
206 inline int NewCapacity(int min_capacity) {
207 int capacity = Max(min_capacity, backing_store_.length());
208 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
209 return new_capacity;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000210 }
211
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000212 void ExpandBuffer() {
213 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
214 memcpy(new_store.start(), backing_store_.start(), position_);
215 backing_store_.Dispose();
216 backing_store_ = new_store;
217 }
218
219 void ConvertToUC16() {
220 ASSERT(is_ascii_);
221 Vector<byte> new_store;
222 int new_content_size = position_ * kUC16Size;
lrn@chromium.org5d00b602011-01-05 09:51:43 +0000223 if (new_content_size >= backing_store_.length()) {
224 // Ensure room for all currently read characters as UC16 as well
225 // as the character about to be stored.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000226 new_store = Vector<byte>::New(NewCapacity(new_content_size));
227 } else {
228 new_store = backing_store_;
229 }
230 char* src = reinterpret_cast<char*>(backing_store_.start());
231 uc16* dst = reinterpret_cast<uc16*>(new_store.start());
232 for (int i = position_ - 1; i >= 0; i--) {
233 dst[i] = src[i];
234 }
235 if (new_store.start() != backing_store_.start()) {
236 backing_store_.Dispose();
237 backing_store_ = new_store;
238 }
239 position_ = new_content_size;
240 is_ascii_ = false;
241 }
242
243 bool is_ascii_;
244 int position_;
245 Vector<byte> backing_store_;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000246
247 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000248};
249
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000250
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000251// ----------------------------------------------------------------------------
252// Scanner base-class.
253
254// Generic functionality used by both JSON and JavaScript scanners.
255class Scanner {
256 public:
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000257 // -1 is outside of the range of any real source code.
258 static const int kNoOctalLocation = -1;
259
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000260 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
261
262 class LiteralScope {
263 public:
264 explicit LiteralScope(Scanner* self);
265 ~LiteralScope();
266 void Complete();
267
268 private:
269 Scanner* scanner_;
270 bool complete_;
271 };
272
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000273 explicit Scanner(UnicodeCache* scanner_contants);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000274
275 // Returns the current token again.
276 Token::Value current_token() { return current_.token; }
277
278 // One token look-ahead (past the token returned by Next()).
279 Token::Value peek() const { return next_.token; }
280
281 struct Location {
282 Location(int b, int e) : beg_pos(b), end_pos(e) { }
283 Location() : beg_pos(0), end_pos(0) { }
ager@chromium.org378b34e2011-01-28 08:04:38 +0000284
285 bool IsValid() const {
286 return beg_pos >= 0 && end_pos >= beg_pos;
287 }
288
lrn@chromium.org1c092762011-05-09 09:42:16 +0000289 static Location invalid() { return Location(-1, -1); }
290
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000291 int beg_pos;
292 int end_pos;
293 };
294
295 // Returns the location information for the current token
296 // (the token returned by Next()).
297 Location location() const { return current_.location; }
298 Location peek_location() const { return next_.location; }
299
300 // Returns the literal string, if any, for the current token (the
301 // token returned by Next()). The string is 0-terminated and in
302 // UTF-8 format; they may contain 0-characters. Literal strings are
303 // collected for identifiers, strings, and numbers.
304 // These functions only give the correct result if the literal
305 // was scanned between calls to StartLiteral() and TerminateLiteral().
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000306 bool is_literal_ascii() {
307 ASSERT_NOT_NULL(current_.literal_chars);
308 return current_.literal_chars->is_ascii();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000309 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000310 Vector<const char> literal_ascii_string() {
311 ASSERT_NOT_NULL(current_.literal_chars);
312 return current_.literal_chars->ascii_literal();
313 }
314 Vector<const uc16> literal_uc16_string() {
315 ASSERT_NOT_NULL(current_.literal_chars);
316 return current_.literal_chars->uc16_literal();
317 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000318 int literal_length() const {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000319 ASSERT_NOT_NULL(current_.literal_chars);
320 return current_.literal_chars->length();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000321 }
322
lrn@chromium.org1c092762011-05-09 09:42:16 +0000323 bool literal_contains_escapes() const {
324 Location location = current_.location;
325 int source_length = (location.end_pos - location.beg_pos);
326 if (current_.token == Token::STRING) {
327 // Subtract delimiters.
328 source_length -= 2;
329 }
330 return current_.literal_chars->length() != source_length;
331 }
332
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000333 // Returns the literal string for the next token (the token that
334 // would be returned if Next() were called).
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000335 bool is_next_literal_ascii() {
336 ASSERT_NOT_NULL(next_.literal_chars);
337 return next_.literal_chars->is_ascii();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000338 }
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000339 Vector<const char> next_literal_ascii_string() {
340 ASSERT_NOT_NULL(next_.literal_chars);
341 return next_.literal_chars->ascii_literal();
342 }
343 Vector<const uc16> next_literal_uc16_string() {
344 ASSERT_NOT_NULL(next_.literal_chars);
345 return next_.literal_chars->uc16_literal();
346 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000347 int next_literal_length() const {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000348 ASSERT_NOT_NULL(next_.literal_chars);
349 return next_.literal_chars->length();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000350 }
351
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000352 static const int kCharacterLookaheadBufferSize = 1;
353
354 protected:
355 // The current and look-ahead token.
356 struct TokenDesc {
357 Token::Value token;
358 Location location;
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000359 LiteralBuffer* literal_chars;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000360 };
361
362 // Call this after setting source_ to the input.
363 void Init() {
364 // Set c0_ (one character ahead)
365 ASSERT(kCharacterLookaheadBufferSize == 1);
366 Advance();
367 // Initialize current_ to not refer to a literal.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000368 current_.literal_chars = NULL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000369 }
370
371 // Literal buffer support
372 inline void StartLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000373 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
374 &literal_buffer2_ : &literal_buffer1_;
375 free_buffer->Reset();
376 next_.literal_chars = free_buffer;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000377 }
378
379 inline void AddLiteralChar(uc32 c) {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000380 ASSERT_NOT_NULL(next_.literal_chars);
381 next_.literal_chars->AddChar(c);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000382 }
383
384 // Complete scanning of a literal.
385 inline void TerminateLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000386 // Does nothing in the current implementation.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000387 }
388
389 // Stops scanning of a literal and drop the collected characters,
390 // e.g., due to an encountered error.
391 inline void DropLiteral() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000392 next_.literal_chars = NULL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000393 }
394
395 inline void AddLiteralCharAdvance() {
396 AddLiteralChar(c0_);
397 Advance();
398 }
399
400 // Low-level scanning support.
401 void Advance() { c0_ = source_->Advance(); }
402 void PushBack(uc32 ch) {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000403 source_->PushBack(c0_);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000404 c0_ = ch;
405 }
406
407 inline Token::Value Select(Token::Value tok) {
408 Advance();
409 return tok;
410 }
411
412 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
413 Advance();
414 if (c0_ == next) {
415 Advance();
416 return then;
417 } else {
418 return else_;
419 }
420 }
421
422 uc32 ScanHexEscape(uc32 c, int length);
kmillikin@chromium.org31b12772011-02-02 16:08:26 +0000423
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000424 // Return the current source position.
425 int source_pos() {
426 return source_->pos() - kCharacterLookaheadBufferSize;
427 }
428
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000429 UnicodeCache* unicode_cache_;
sgjesse@chromium.orgea88ce92011-03-23 11:19:56 +0000430
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000431 // Buffers collecting literal strings, numbers, etc.
432 LiteralBuffer literal_buffer1_;
433 LiteralBuffer literal_buffer2_;
434
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000435 TokenDesc current_; // desc for current token (as returned by Next())
436 TokenDesc next_; // desc for next token (one token look-ahead)
437
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000438 // Input stream. Must be initialized to an UC16CharacterStream.
439 UC16CharacterStream* source_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000440
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000441 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
442 uc32 c0_;
443};
444
445// ----------------------------------------------------------------------------
446// JavaScriptScanner - base logic for JavaScript scanning.
447
448class JavaScriptScanner : public Scanner {
449 public:
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000450 // A LiteralScope that disables recording of some types of JavaScript
451 // literals. If the scanner is configured to not record the specific
452 // type of literal, the scope will not call StartLiteral.
453 class LiteralScope {
454 public:
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000455 explicit LiteralScope(JavaScriptScanner* self)
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000456 : scanner_(self), complete_(false) {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000457 scanner_->StartLiteral();
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000458 }
459 ~LiteralScope() {
460 if (!complete_) scanner_->DropLiteral();
461 }
462 void Complete() {
463 scanner_->TerminateLiteral();
464 complete_ = true;
465 }
466
467 private:
468 JavaScriptScanner* scanner_;
469 bool complete_;
470 };
471
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000472 explicit JavaScriptScanner(UnicodeCache* scanner_contants);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000473
lrn@chromium.orgac2828d2011-06-23 06:29:21 +0000474 void Initialize(UC16CharacterStream* source);
475
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000476 // Returns the next token.
477 Token::Value Next();
478
whesse@chromium.orgdf8c03c2011-06-21 14:36:03 +0000479 // Returns true if there was a line terminator before the peek'ed token,
480 // possibly inside a multi-line comment.
481 bool HasAnyLineTerminatorBeforeNext() const {
482 return has_line_terminator_before_next_ ||
483 has_multiline_comment_before_next_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000484 }
485
486 // Scans the input as a regular expression pattern, previous
487 // character(s) must be /(=). Returns true if a pattern is scanned.
488 bool ScanRegExpPattern(bool seen_equal);
489 // Returns true if regexp flags are scanned (always since flags can
490 // be empty).
491 bool ScanRegExpFlags();
492
493 // Tells whether the buffer contains an identifier (no escapes).
494 // Used for checking if a property name is an identifier.
495 static bool IsIdentifier(unibrow::CharacterStream* buffer);
496
lrn@chromium.org1c092762011-05-09 09:42:16 +0000497 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
498 uc32 ScanOctalEscape(uc32 c, int length);
499
500 // Returns the location of the last seen octal literal
501 Location octal_position() const { return octal_pos_; }
502 void clear_octal_position() { octal_pos_ = Location::invalid(); }
503
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000504 // Seek forward to the given position. This operation does not
505 // work in general, for instance when there are pushed back
506 // characters, but works for seeking forward until simple delimiter
507 // tokens, which is what it is used for.
508 void SeekForward(int pos);
509
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000510 protected:
511 bool SkipWhiteSpace();
512 Token::Value SkipSingleLineComment();
513 Token::Value SkipMultiLineComment();
514
515 // Scans a single JavaScript token.
516 void Scan();
517
518 void ScanDecimalDigits();
519 Token::Value ScanNumber(bool seen_period);
520 Token::Value ScanIdentifierOrKeyword();
521 Token::Value ScanIdentifierSuffix(LiteralScope* literal);
522
523 void ScanEscape();
524 Token::Value ScanString();
525
526 // Scans a possible HTML comment -- begins with '<!'.
527 Token::Value ScanHtmlComment();
528
529 // Decodes a unicode escape-sequence which is part of an identifier.
530 // If the escape sequence cannot be decoded the result is kBadChar.
531 uc32 ScanIdentifierUnicodeEscape();
532
lrn@chromium.org1c092762011-05-09 09:42:16 +0000533 // Start position of the octal literal last scanned.
534 Location octal_pos_;
535
whesse@chromium.orgdf8c03c2011-06-21 14:36:03 +0000536 // Whether there is a line terminator whitespace character after
537 // the current token, and before the next. Does not count newlines
538 // inside multiline comments.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000539 bool has_line_terminator_before_next_;
lrn@chromium.orgac2828d2011-06-23 06:29:21 +0000540 // Whether there is a multi-line comment that contains a
whesse@chromium.orgdf8c03c2011-06-21 14:36:03 +0000541 // line-terminator after the current token, and before the next.
542 bool has_multiline_comment_before_next_;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000543};
544
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000545} } // namespace v8::internal
546
547#endif // V8_SCANNER_BASE_H_