blob: 24a6d4be9cbf79ea11d6f7d65d5a5decdf45146a [file] [log] [blame]
ager@chromium.org9258b6b2008-09-11 09:11:10 +00001// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "scanner.h"
32
kasperl@chromium.org71affb52009-05-26 05:44:31 +000033namespace v8 {
34namespace internal {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000035
36// ----------------------------------------------------------------------------
37// Character predicates
38
39
// Out-of-line definitions of the Scanner's static character-class
// predicates. NOTE(review): the second template argument (128) looks
// like the size of the predicate's lookup cache - confirm in unibrow.
unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;


// Out-of-line definition of the Scanner's static UTF-8 decoder resource.
StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
47
48
49// ----------------------------------------------------------------------------
50// UTF8Buffer
51
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000052UTF8Buffer::UTF8Buffer() {
53 static const int kInitialCapacity = 1 * KB;
54 data_ = NewArray<char>(kInitialCapacity);
55 limit_ = ComputeLimit(data_, kInitialCapacity);
56 Reset();
57 ASSERT(Capacity() == kInitialCapacity && pos() == 0);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000058}
59
60
61UTF8Buffer::~UTF8Buffer() {
62 DeleteArray(data_);
63}
64
65
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000066void UTF8Buffer::AddCharSlow(uc32 c) {
67 static const int kCapacityGrowthLimit = 1 * MB;
68 if (cursor_ > limit_) {
69 int old_capacity = Capacity();
70 int old_position = pos();
71 int new_capacity =
72 Min(old_capacity * 2, old_capacity + kCapacityGrowthLimit);
73 char* new_data = NewArray<char>(new_capacity);
74 memcpy(new_data, data_, old_position);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000075 DeleteArray(data_);
76 data_ = new_data;
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000077 cursor_ = new_data + old_position;
78 limit_ = ComputeLimit(new_data, new_capacity);
79 ASSERT(Capacity() == new_capacity && pos() == old_position);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000080 }
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000081 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
82 *cursor_++ = c; // Common case: 7-bit ASCII.
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000083 } else {
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000084 cursor_ += unibrow::Utf8::Encode(cursor_, c);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000085 }
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +000086 ASSERT(pos() <= Capacity());
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +000087}
88
89
90// ----------------------------------------------------------------------------
91// UTF16Buffer
92
93
94UTF16Buffer::UTF16Buffer()
95 : pos_(0),
96 pushback_buffer_(0),
97 last_(0),
98 stream_(NULL) { }
99
100
101void UTF16Buffer::Initialize(Handle<String> data,
102 unibrow::CharacterStream* input) {
103 data_ = data;
104 pos_ = 0;
105 stream_ = input;
106}
107
108
109Handle<String> UTF16Buffer::SubString(int start, int end) {
110 return internal::SubString(data_, start, end);
111}
112
113
114void UTF16Buffer::PushBack(uc32 ch) {
115 pushback_buffer()->Add(last_);
116 last_ = ch;
117 pos_--;
118}
119
120
121uc32 UTF16Buffer::Advance() {
122 // NOTE: It is of importance to Persian / Farsi resources that we do
123 // *not* strip format control characters in the scanner; see
124 //
125 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152
126 //
127 // So, even though ECMA-262, section 7.1, page 11, dictates that we
128 // must remove Unicode format-control characters, we do not. This is
129 // in line with how IE and SpiderMonkey handles it.
130 if (!pushback_buffer()->is_empty()) {
131 pos_++;
132 return last_ = pushback_buffer()->RemoveLast();
133 } else if (stream_->has_more()) {
134 pos_++;
135 uc32 next = stream_->GetNext();
136 return last_ = next;
137 } else {
138 // note: currently the following increment is necessary to avoid a
139 // test-parser problem!
140 pos_++;
141 return last_ = static_cast<uc32>(-1);
142 }
143}
144
145
146void UTF16Buffer::SeekForward(int pos) {
147 pos_ = pos;
148 ASSERT(pushback_buffer()->is_empty());
149 stream_->Seek(pos);
150}
151
152
153// ----------------------------------------------------------------------------
154// Scanner
155
156Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
157 Token::Initialize();
158}
159
160
161void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
162 int position) {
163 // Initialize the source buffer.
164 source_.Initialize(source, stream);
165 position_ = position;
166
167 // Reset literals buffer
168 literals_.Reset();
169
170 // Set c0_ (one character ahead)
171 ASSERT(kCharacterLookaheadBufferSize == 1);
172 Advance();
173
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000174 // Skip initial whitespace allowing HTML comment ends just like
175 // after a newline and scan first token.
176 has_line_terminator_before_next_ = true;
177 SkipWhiteSpace();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000178 Scan();
179}
180
181
182Handle<String> Scanner::SubString(int start, int end) {
183 return source_.SubString(start - position_, end - position_);
184}
185
186
187Token::Value Scanner::Next() {
188 // BUG 1215673: Find a thread safe way to set a stack limit in
189 // pre-parse mode. Otherwise, we cannot safely pre-parse from other
190 // threads.
191 current_ = next_;
192 // Check for stack-overflow before returning any tokens.
193 StackLimitCheck check;
194 if (check.HasOverflowed()) {
195 stack_overflow_ = true;
196 next_.token = Token::ILLEGAL;
197 } else {
198 Scan();
199 }
200 return current_.token;
201}
202
203
204void Scanner::StartLiteral() {
205 next_.literal_pos = literals_.pos();
206}
207
208
209void Scanner::AddChar(uc32 c) {
210 literals_.AddChar(c);
211}
212
213
214void Scanner::TerminateLiteral() {
215 next_.literal_end = literals_.pos();
216 AddChar(0);
217}
218
219
220void Scanner::AddCharAdvance() {
221 AddChar(c0_);
222 Advance();
223}
224
225
226void Scanner::Advance() {
227 c0_ = source_.Advance();
228}
229
230
231void Scanner::PushBack(uc32 ch) {
232 source_.PushBack(ch);
233 c0_ = ch;
234}
235
236
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000237static inline bool IsByteOrderMark(uc32 c) {
238 // The Unicode value U+FFFE is guaranteed never to be assigned as a
239 // Unicode character; this implies that in a Unicode context the
240 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
241 // character expressed in little-endian byte order (since it could
242 // not be a U+FFFE character expressed in big-endian byte
243 // order). Nevertheless, we check for it to be compatible with
244 // Spidermonkey.
245 return c == 0xFEFF || c == 0xFFFE;
246}
247
248
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000249bool Scanner::SkipWhiteSpace() {
250 int start_position = source_pos();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000251
252 while (true) {
ager@chromium.org3bf7b912008-11-17 09:09:45 +0000253 // We treat byte-order marks (BOMs) as whitespace for better
254 // compatibility with Spidermonkey and other JavaScript engines.
255 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000256 // IsWhiteSpace() includes line terminators!
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000257 if (kIsLineTerminator.get(c0_)) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000258 // Ignore line terminators, but remember them. This is necessary
259 // for automatic semicolon insertion.
260 has_line_terminator_before_next_ = true;
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000261 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000262 Advance();
263 }
264
265 // If there is an HTML comment end '-->' at the beginning of a
266 // line (with only whitespace in front of it), we treat the rest
267 // of the line as a comment. This is in line with the way
268 // SpiderMonkey handles it.
269 if (c0_ == '-' && has_line_terminator_before_next_) {
270 Advance();
271 if (c0_ == '-') {
272 Advance();
273 if (c0_ == '>') {
274 // Treat the rest of the line as a comment.
275 SkipSingleLineComment();
276 // Continue skipping white space after the comment.
277 continue;
278 }
279 PushBack('-'); // undo Advance()
280 }
281 PushBack('-'); // undo Advance()
282 }
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000283 // Return whether or not we skipped any characters.
284 return source_pos() != start_position;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000285 }
286}
287
288
289Token::Value Scanner::SkipSingleLineComment() {
290 Advance();
291
292 // The line terminator at the end of the line is not considered
293 // to be part of the single-line comment; it is recognized
294 // separately by the lexical grammar and becomes part of the
295 // stream of input elements for the syntactic grammar (see
296 // ECMA-262, section 7.4, page 12).
297 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
298 Advance();
299 }
300
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000301 return Token::WHITESPACE;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000302}
303
304
305Token::Value Scanner::SkipMultiLineComment() {
306 ASSERT(c0_ == '*');
307 Advance();
308
309 while (c0_ >= 0) {
310 char ch = c0_;
311 Advance();
312 // If we have reached the end of the multi-line comment, we
313 // consume the '/' and insert a whitespace. This way all
314 // multi-line comments are treated as whitespace - even the ones
315 // containing line terminators. This contradicts ECMA-262, section
316 // 7.4, page 12, that says that multi-line comments containing
317 // line terminators should be treated as a line terminator, but it
318 // matches the behaviour of SpiderMonkey and KJS.
319 if (ch == '*' && c0_ == '/') {
320 c0_ = ' ';
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000321 return Token::WHITESPACE;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000322 }
323 }
324
325 // Unterminated multi-line comment.
326 return Token::ILLEGAL;
327}
328
329
330Token::Value Scanner::ScanHtmlComment() {
331 // Check for <!-- comments.
332 ASSERT(c0_ == '!');
333 Advance();
334 if (c0_ == '-') {
335 Advance();
336 if (c0_ == '-') return SkipSingleLineComment();
337 PushBack('-'); // undo Advance()
338 }
339 PushBack('!'); // undo Advance()
340 ASSERT(c0_ == '!');
341 return Token::LT;
342}
343
344
345void Scanner::Scan() {
346 Token::Value token;
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000347 has_line_terminator_before_next_ = false;
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000348 do {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000349 // Remember the position of the next token
350 next_.location.beg_pos = source_pos();
351
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000352 switch (c0_) {
353 case ' ':
354 case '\t':
355 Advance();
356 token = Token::WHITESPACE;
357 break;
358
359 case '\n':
360 Advance();
361 has_line_terminator_before_next_ = true;
362 token = Token::WHITESPACE;
363 break;
364
365 case '"': case '\'':
366 token = ScanString();
367 break;
368
369 case '<':
370 // < <= << <<= <!--
371 Advance();
372 if (c0_ == '=') {
373 token = Select(Token::LTE);
374 } else if (c0_ == '<') {
375 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
376 } else if (c0_ == '!') {
377 token = ScanHtmlComment();
378 } else {
379 token = Token::LT;
380 }
381 break;
382
383 case '>':
384 // > >= >> >>= >>> >>>=
385 Advance();
386 if (c0_ == '=') {
387 token = Select(Token::GTE);
388 } else if (c0_ == '>') {
389 // >> >>= >>> >>>=
390 Advance();
391 if (c0_ == '=') {
392 token = Select(Token::ASSIGN_SAR);
393 } else if (c0_ == '>') {
394 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
395 } else {
396 token = Token::SAR;
397 }
398 } else {
399 token = Token::GT;
400 }
401 break;
402
403 case '=':
404 // = == ===
405 Advance();
406 if (c0_ == '=') {
407 token = Select('=', Token::EQ_STRICT, Token::EQ);
408 } else {
409 token = Token::ASSIGN;
410 }
411 break;
412
413 case '!':
414 // ! != !==
415 Advance();
416 if (c0_ == '=') {
417 token = Select('=', Token::NE_STRICT, Token::NE);
418 } else {
419 token = Token::NOT;
420 }
421 break;
422
423 case '+':
424 // + ++ +=
425 Advance();
426 if (c0_ == '+') {
427 token = Select(Token::INC);
428 } else if (c0_ == '=') {
429 token = Select(Token::ASSIGN_ADD);
430 } else {
431 token = Token::ADD;
432 }
433 break;
434
435 case '-':
436 // - -- --> -=
437 Advance();
438 if (c0_ == '-') {
439 Advance();
440 if (c0_ == '>' && has_line_terminator_before_next_) {
441 // For compatibility with SpiderMonkey, we skip lines that
442 // start with an HTML comment end '-->'.
443 token = SkipSingleLineComment();
444 } else {
445 token = Token::DEC;
446 }
447 } else if (c0_ == '=') {
448 token = Select(Token::ASSIGN_SUB);
449 } else {
450 token = Token::SUB;
451 }
452 break;
453
454 case '*':
455 // * *=
456 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
457 break;
458
459 case '%':
460 // % %=
461 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
462 break;
463
464 case '/':
465 // / // /* /=
466 Advance();
467 if (c0_ == '/') {
468 token = SkipSingleLineComment();
469 } else if (c0_ == '*') {
470 token = SkipMultiLineComment();
471 } else if (c0_ == '=') {
472 token = Select(Token::ASSIGN_DIV);
473 } else {
474 token = Token::DIV;
475 }
476 break;
477
478 case '&':
479 // & && &=
480 Advance();
481 if (c0_ == '&') {
482 token = Select(Token::AND);
483 } else if (c0_ == '=') {
484 token = Select(Token::ASSIGN_BIT_AND);
485 } else {
486 token = Token::BIT_AND;
487 }
488 break;
489
490 case '|':
491 // | || |=
492 Advance();
493 if (c0_ == '|') {
494 token = Select(Token::OR);
495 } else if (c0_ == '=') {
496 token = Select(Token::ASSIGN_BIT_OR);
497 } else {
498 token = Token::BIT_OR;
499 }
500 break;
501
502 case '^':
503 // ^ ^=
504 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
505 break;
506
507 case '.':
508 // . Number
509 Advance();
510 if (IsDecimalDigit(c0_)) {
511 token = ScanNumber(true);
512 } else {
513 token = Token::PERIOD;
514 }
515 break;
516
517 case ':':
518 token = Select(Token::COLON);
519 break;
520
521 case ';':
522 token = Select(Token::SEMICOLON);
523 break;
524
525 case ',':
526 token = Select(Token::COMMA);
527 break;
528
529 case '(':
530 token = Select(Token::LPAREN);
531 break;
532
533 case ')':
534 token = Select(Token::RPAREN);
535 break;
536
537 case '[':
538 token = Select(Token::LBRACK);
539 break;
540
541 case ']':
542 token = Select(Token::RBRACK);
543 break;
544
545 case '{':
546 token = Select(Token::LBRACE);
547 break;
548
549 case '}':
550 token = Select(Token::RBRACE);
551 break;
552
553 case '?':
554 token = Select(Token::CONDITIONAL);
555 break;
556
557 case '~':
558 token = Select(Token::BIT_NOT);
559 break;
560
561 default:
562 if (kIsIdentifierStart.get(c0_)) {
563 token = ScanIdentifier();
564 } else if (IsDecimalDigit(c0_)) {
565 token = ScanNumber(false);
566 } else if (SkipWhiteSpace()) {
567 token = Token::WHITESPACE;
568 } else if (c0_ < 0) {
569 token = Token::EOS;
570 } else {
571 token = Select(Token::ILLEGAL);
572 }
573 break;
574 }
575
576 // Continue scanning for tokens as long as we're just skipping
577 // whitespace.
578 } while (token == Token::WHITESPACE);
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000579
580 next_.location.end_pos = source_pos();
581 next_.token = token;
582}
583
584
585void Scanner::SeekForward(int pos) {
586 source_.SeekForward(pos - 1);
587 Advance();
588 Scan();
589}
590
591
592uc32 Scanner::ScanHexEscape(uc32 c, int length) {
593 ASSERT(length <= 4); // prevent overflow
594
595 uc32 digits[4];
596 uc32 x = 0;
597 for (int i = 0; i < length; i++) {
598 digits[i] = c0_;
599 int d = HexValue(c0_);
600 if (d < 0) {
601 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
602 // should be illegal, but other JS VMs just return the
603 // non-escaped version of the original character.
604
605 // Push back digits read, except the last one (in c0_).
606 for (int j = i-1; j >= 0; j--) {
607 PushBack(digits[j]);
608 }
ager@chromium.org6f10e412009-02-13 10:11:16 +0000609 // Notice: No handling of error - treat it as "\u"->"u".
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000610 return c;
611 }
612 x = x * 16 + d;
613 Advance();
614 }
615
616 return x;
617}
618
619
620// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
621// ECMA-262. Other JS VMs support them.
622uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
623 uc32 x = c - '0';
624 for (int i = 0; i < length; i++) {
625 int d = c0_ - '0';
626 if (d < 0 || d > 7) break;
627 int nx = x * 8 + d;
628 if (nx >= 256) break;
629 x = nx;
630 Advance();
631 }
632 return x;
633}
634
635
636void Scanner::ScanEscape() {
637 uc32 c = c0_;
638 Advance();
639
640 // Skip escaped newlines.
641 if (kIsLineTerminator.get(c)) {
642 // Allow CR+LF newlines in multiline string literals.
643 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
644 // Allow LF+CR newlines in multiline string literals.
645 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
646 return;
647 }
648
649 switch (c) {
650 case '\'': // fall through
651 case '"' : // fall through
652 case '\\': break;
653 case 'b' : c = '\b'; break;
654 case 'f' : c = '\f'; break;
655 case 'n' : c = '\n'; break;
656 case 'r' : c = '\r'; break;
657 case 't' : c = '\t'; break;
658 case 'u' : c = ScanHexEscape(c, 4); break;
659 case 'v' : c = '\v'; break;
660 case 'x' : c = ScanHexEscape(c, 2); break;
661 case '0' : // fall through
662 case '1' : // fall through
663 case '2' : // fall through
664 case '3' : // fall through
665 case '4' : // fall through
666 case '5' : // fall through
667 case '6' : // fall through
668 case '7' : c = ScanOctalEscape(c, 2); break;
669 }
670
671 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
672 // should be illegal, but they are commonly handled
673 // as non-escaped characters by JS VMs.
674 AddChar(c);
675}
676
677
678Token::Value Scanner::ScanString() {
679 uc32 quote = c0_;
680 Advance(); // consume quote
681
682 StartLiteral();
683 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
684 uc32 c = c0_;
685 Advance();
686 if (c == '\\') {
687 if (c0_ < 0) return Token::ILLEGAL;
688 ScanEscape();
689 } else {
690 AddChar(c);
691 }
692 }
693 if (c0_ != quote) {
694 return Token::ILLEGAL;
695 }
696 TerminateLiteral();
697
698 Advance(); // consume quote
699 return Token::STRING;
700}
701
702
703Token::Value Scanner::Select(Token::Value tok) {
704 Advance();
705 return tok;
706}
707
708
709Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
710 Advance();
711 if (c0_ == next) {
712 Advance();
713 return then;
714 } else {
715 return else_;
716 }
717}
718
719
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000720// Returns true if any decimal digits were scanned, returns false otherwise.
721void Scanner::ScanDecimalDigits() {
722 while (IsDecimalDigit(c0_))
723 AddCharAdvance();
724}
725
726
727Token::Value Scanner::ScanNumber(bool seen_period) {
728 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
729
730 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
731
732 StartLiteral();
733 if (seen_period) {
734 // we have already seen a decimal point of the float
735 AddChar('.');
736 ScanDecimalDigits(); // we know we have at least one digit
737
738 } else {
739 // if the first character is '0' we must check for octals and hex
740 if (c0_ == '0') {
741 AddCharAdvance();
742
743 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
744 if (c0_ == 'x' || c0_ == 'X') {
745 // hex number
746 kind = HEX;
747 AddCharAdvance();
748 if (!IsHexDigit(c0_))
749 // we must have at least one hex digit after 'x'/'X'
750 return Token::ILLEGAL;
751 while (IsHexDigit(c0_))
752 AddCharAdvance();
753
754 } else if ('0' <= c0_ && c0_ <= '7') {
755 // (possible) octal number
756 kind = OCTAL;
757 while (true) {
758 if (c0_ == '8' || c0_ == '9') {
759 kind = DECIMAL;
760 break;
761 }
762 if (c0_ < '0' || '7' < c0_) break;
763 AddCharAdvance();
764 }
765 }
766 }
767
768 // Parse decimal digits and allow trailing fractional part.
769 if (kind == DECIMAL) {
770 ScanDecimalDigits(); // optional
771 if (c0_ == '.') {
772 AddCharAdvance();
773 ScanDecimalDigits(); // optional
774 }
775 }
776 }
777
778 // scan exponent, if any
779 if (c0_ == 'e' || c0_ == 'E') {
780 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
781 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
782 // scan exponent
783 AddCharAdvance();
784 if (c0_ == '+' || c0_ == '-')
785 AddCharAdvance();
786 if (!IsDecimalDigit(c0_))
787 // we must have at least one decimal digit after 'e'/'E'
788 return Token::ILLEGAL;
789 ScanDecimalDigits();
790 }
791 TerminateLiteral();
792
793 // The source character immediately following a numeric literal must
794 // not be an identifier start or a decimal digit; see ECMA-262
795 // section 7.8.3, page 17 (note that we read only one decimal digit
796 // if the value is 0).
797 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
798 return Token::ILLEGAL;
799
800 return Token::NUMBER;
801}
802
803
804uc32 Scanner::ScanIdentifierUnicodeEscape() {
805 Advance();
806 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
807 Advance();
808 uc32 c = ScanHexEscape('u', 4);
809 // We do not allow a unicode escape sequence to start another
810 // unicode escape sequence.
811 if (c == '\\') return unibrow::Utf8::kBadChar;
812 return c;
813}
814
815
816Token::Value Scanner::ScanIdentifier() {
817 ASSERT(kIsIdentifierStart.get(c0_));
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000818 bool has_escapes = false;
819
820 StartLiteral();
821 // Scan identifier start character.
822 if (c0_ == '\\') {
823 has_escapes = true;
824 uc32 c = ScanIdentifierUnicodeEscape();
825 // Only allow legal identifier start characters.
826 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
827 AddChar(c);
828 } else {
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000829 AddChar(c0_);
830 Advance();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000831 }
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000832
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000833 // Scan the rest of the identifier characters.
834 while (kIsIdentifierPart.get(c0_)) {
835 if (c0_ == '\\') {
836 has_escapes = true;
837 uc32 c = ScanIdentifierUnicodeEscape();
838 // Only allow legal identifier part characters.
839 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
840 AddChar(c);
841 } else {
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000842 AddChar(c0_);
843 Advance();
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000844 }
845 }
846 TerminateLiteral();
847
848 // We don't have any 1-letter keywords (this is probably a common case).
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000849 if ((next_.literal_end - next_.literal_pos) == 1) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000850 return Token::IDENTIFIER;
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000851 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000852
853 // If the identifier contains unicode escapes, it must not be
854 // resolved to a keyword.
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000855 if (has_escapes) {
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000856 return Token::IDENTIFIER;
kasperl@chromium.orgb3284ad2009-05-18 06:12:45 +0000857 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000858
859 return Token::Lookup(&literals_.data()[next_.literal_pos]);
860}
861
862
863
864bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
ager@chromium.org32912102009-01-16 10:38:43 +0000865 // Checks whether the buffer contains an identifier (no escape).
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000866 if (!buffer->has_more()) return false;
867 if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
868 while (buffer->has_more()) {
869 if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
870 }
871 return true;
872}
873
874
875bool Scanner::ScanRegExpPattern(bool seen_equal) {
876 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
877 bool in_character_class = false;
878
879 // Previous token is either '/' or '/=', in the second case, the
880 // pattern starts at =.
881 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
882 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
883
884 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
885 // the scanner should pass uninterpreted bodies to the RegExp
886 // constructor.
887 StartLiteral();
888 if (seen_equal)
889 AddChar('=');
890
891 while (c0_ != '/' || in_character_class) {
892 if (kIsLineTerminator.get(c0_) || c0_ < 0)
893 return false;
894 if (c0_ == '\\') { // escaped character
895 AddCharAdvance();
896 if (kIsLineTerminator.get(c0_) || c0_ < 0)
897 return false;
898 AddCharAdvance();
899 } else { // unescaped character
900 if (c0_ == '[')
901 in_character_class = true;
902 if (c0_ == ']')
903 in_character_class = false;
904 AddCharAdvance();
905 }
906 }
907 Advance(); // consume '/'
908
909 TerminateLiteral();
910
911 return true;
912}
913
914bool Scanner::ScanRegExpFlags() {
915 // Scan regular expression flags.
916 StartLiteral();
ager@chromium.org6f10e412009-02-13 10:11:16 +0000917 while (kIsIdentifierPart.get(c0_)) {
918 if (c0_ == '\\') {
919 uc32 c = ScanIdentifierUnicodeEscape();
920 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
921 // We allow any escaped character, unlike the restriction on
922 // IdentifierPart when it is used to build an IdentifierName.
923 AddChar(c);
924 continue;
925 }
926 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000927 AddCharAdvance();
ager@chromium.org6f10e412009-02-13 10:11:16 +0000928 }
christian.plesner.hansen43d26ec2008-07-03 15:10:15 +0000929 TerminateLiteral();
930
931 next_.location.end_pos = source_pos() - 1;
932 return true;
933}
934
935} } // namespace v8::internal