blob: e15ef416c9c4a8201bfc134db0a73f7928e565b4 [file] [log] [blame]
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29
erik.corry@gmail.com4a6c3272010-11-18 12:04:40 +000030#include "../include/v8stdint.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000031#include "scanner-base.h"
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000032#include "char-predicates-inl.h"
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +000033
34namespace v8 {
35namespace internal {
36
// ----------------------------------------------------------------------------
// Scanner
39
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +000040Scanner::Scanner(UnicodeCache* unicode_cache)
lrn@chromium.org1c092762011-05-09 09:42:16 +000041 : unicode_cache_(unicode_cache) { }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000042
43
44uc32 Scanner::ScanHexEscape(uc32 c, int length) {
45 ASSERT(length <= 4); // prevent overflow
46
47 uc32 digits[4];
48 uc32 x = 0;
49 for (int i = 0; i < length; i++) {
50 digits[i] = c0_;
51 int d = HexValue(c0_);
52 if (d < 0) {
53 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
54 // should be illegal, but other JS VMs just return the
55 // non-escaped version of the original character.
56
57 // Push back digits read, except the last one (in c0_).
58 for (int j = i-1; j >= 0; j--) {
59 PushBack(digits[j]);
60 }
61 // Notice: No handling of error - treat it as "\u"->"u".
62 return c;
63 }
64 x = x * 16 + d;
65 Advance();
66 }
67
68 return x;
69}
70
71
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000072
// ----------------------------------------------------------------------------
// JavaScriptScanner
75
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +000076JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
lrn@chromium.org1c092762011-05-09 09:42:16 +000077 : Scanner(scanner_contants), octal_pos_(Location::invalid()) { }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +000078
79
80Token::Value JavaScriptScanner::Next() {
81 current_ = next_;
82 has_line_terminator_before_next_ = false;
83 Scan();
84 return current_.token;
85}
86
87
88static inline bool IsByteOrderMark(uc32 c) {
89 // The Unicode value U+FFFE is guaranteed never to be assigned as a
90 // Unicode character; this implies that in a Unicode context the
91 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
92 // character expressed in little-endian byte order (since it could
93 // not be a U+FFFE character expressed in big-endian byte
94 // order). Nevertheless, we check for it to be compatible with
95 // Spidermonkey.
96 return c == 0xFEFF || c == 0xFFFE;
97}
98
99
100bool JavaScriptScanner::SkipWhiteSpace() {
101 int start_position = source_pos();
102
103 while (true) {
104 // We treat byte-order marks (BOMs) as whitespace for better
105 // compatibility with Spidermonkey and other JavaScript engines.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000106 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000107 // IsWhiteSpace() includes line terminators!
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000108 if (unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000109 // Ignore line terminators, but remember them. This is necessary
110 // for automatic semicolon insertion.
111 has_line_terminator_before_next_ = true;
112 }
113 Advance();
114 }
115
116 // If there is an HTML comment end '-->' at the beginning of a
117 // line (with only whitespace in front of it), we treat the rest
118 // of the line as a comment. This is in line with the way
119 // SpiderMonkey handles it.
120 if (c0_ == '-' && has_line_terminator_before_next_) {
121 Advance();
122 if (c0_ == '-') {
123 Advance();
124 if (c0_ == '>') {
125 // Treat the rest of the line as a comment.
126 SkipSingleLineComment();
127 // Continue skipping white space after the comment.
128 continue;
129 }
130 PushBack('-'); // undo Advance()
131 }
132 PushBack('-'); // undo Advance()
133 }
134 // Return whether or not we skipped any characters.
135 return source_pos() != start_position;
136 }
137}
138
139
140Token::Value JavaScriptScanner::SkipSingleLineComment() {
141 Advance();
142
143 // The line terminator at the end of the line is not considered
144 // to be part of the single-line comment; it is recognized
145 // separately by the lexical grammar and becomes part of the
146 // stream of input elements for the syntactic grammar (see
147 // ECMA-262, section 7.4, page 12).
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000148 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000149 Advance();
150 }
151
152 return Token::WHITESPACE;
153}
154
155
156Token::Value JavaScriptScanner::SkipMultiLineComment() {
157 ASSERT(c0_ == '*');
158 Advance();
159
160 while (c0_ >= 0) {
161 char ch = c0_;
162 Advance();
163 // If we have reached the end of the multi-line comment, we
164 // consume the '/' and insert a whitespace. This way all
165 // multi-line comments are treated as whitespace - even the ones
166 // containing line terminators. This contradicts ECMA-262, section
167 // 7.4, page 12, that says that multi-line comments containing
168 // line terminators should be treated as a line terminator, but it
169 // matches the behaviour of SpiderMonkey and KJS.
170 if (ch == '*' && c0_ == '/') {
171 c0_ = ' ';
172 return Token::WHITESPACE;
173 }
174 }
175
176 // Unterminated multi-line comment.
177 return Token::ILLEGAL;
178}
179
180
181Token::Value JavaScriptScanner::ScanHtmlComment() {
182 // Check for <!-- comments.
183 ASSERT(c0_ == '!');
184 Advance();
185 if (c0_ == '-') {
186 Advance();
187 if (c0_ == '-') return SkipSingleLineComment();
188 PushBack('-'); // undo Advance()
189 }
190 PushBack('!'); // undo Advance()
191 ASSERT(c0_ == '!');
192 return Token::LT;
193}
194
195
196void JavaScriptScanner::Scan() {
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000197 next_.literal_chars = NULL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000198 Token::Value token;
199 do {
200 // Remember the position of the next token
201 next_.location.beg_pos = source_pos();
202
203 switch (c0_) {
204 case ' ':
205 case '\t':
206 Advance();
207 token = Token::WHITESPACE;
208 break;
209
210 case '\n':
211 Advance();
212 has_line_terminator_before_next_ = true;
213 token = Token::WHITESPACE;
214 break;
215
216 case '"': case '\'':
217 token = ScanString();
218 break;
219
220 case '<':
221 // < <= << <<= <!--
222 Advance();
223 if (c0_ == '=') {
224 token = Select(Token::LTE);
225 } else if (c0_ == '<') {
226 token = Select('=', Token::ASSIGN_SHL, Token::SHL);
227 } else if (c0_ == '!') {
228 token = ScanHtmlComment();
229 } else {
230 token = Token::LT;
231 }
232 break;
233
234 case '>':
235 // > >= >> >>= >>> >>>=
236 Advance();
237 if (c0_ == '=') {
238 token = Select(Token::GTE);
239 } else if (c0_ == '>') {
240 // >> >>= >>> >>>=
241 Advance();
242 if (c0_ == '=') {
243 token = Select(Token::ASSIGN_SAR);
244 } else if (c0_ == '>') {
245 token = Select('=', Token::ASSIGN_SHR, Token::SHR);
246 } else {
247 token = Token::SAR;
248 }
249 } else {
250 token = Token::GT;
251 }
252 break;
253
254 case '=':
255 // = == ===
256 Advance();
257 if (c0_ == '=') {
258 token = Select('=', Token::EQ_STRICT, Token::EQ);
259 } else {
260 token = Token::ASSIGN;
261 }
262 break;
263
264 case '!':
265 // ! != !==
266 Advance();
267 if (c0_ == '=') {
268 token = Select('=', Token::NE_STRICT, Token::NE);
269 } else {
270 token = Token::NOT;
271 }
272 break;
273
274 case '+':
275 // + ++ +=
276 Advance();
277 if (c0_ == '+') {
278 token = Select(Token::INC);
279 } else if (c0_ == '=') {
280 token = Select(Token::ASSIGN_ADD);
281 } else {
282 token = Token::ADD;
283 }
284 break;
285
286 case '-':
287 // - -- --> -=
288 Advance();
289 if (c0_ == '-') {
290 Advance();
291 if (c0_ == '>' && has_line_terminator_before_next_) {
292 // For compatibility with SpiderMonkey, we skip lines that
293 // start with an HTML comment end '-->'.
294 token = SkipSingleLineComment();
295 } else {
296 token = Token::DEC;
297 }
298 } else if (c0_ == '=') {
299 token = Select(Token::ASSIGN_SUB);
300 } else {
301 token = Token::SUB;
302 }
303 break;
304
305 case '*':
306 // * *=
307 token = Select('=', Token::ASSIGN_MUL, Token::MUL);
308 break;
309
310 case '%':
311 // % %=
312 token = Select('=', Token::ASSIGN_MOD, Token::MOD);
313 break;
314
315 case '/':
316 // / // /* /=
317 Advance();
318 if (c0_ == '/') {
319 token = SkipSingleLineComment();
320 } else if (c0_ == '*') {
321 token = SkipMultiLineComment();
322 } else if (c0_ == '=') {
323 token = Select(Token::ASSIGN_DIV);
324 } else {
325 token = Token::DIV;
326 }
327 break;
328
329 case '&':
330 // & && &=
331 Advance();
332 if (c0_ == '&') {
333 token = Select(Token::AND);
334 } else if (c0_ == '=') {
335 token = Select(Token::ASSIGN_BIT_AND);
336 } else {
337 token = Token::BIT_AND;
338 }
339 break;
340
341 case '|':
342 // | || |=
343 Advance();
344 if (c0_ == '|') {
345 token = Select(Token::OR);
346 } else if (c0_ == '=') {
347 token = Select(Token::ASSIGN_BIT_OR);
348 } else {
349 token = Token::BIT_OR;
350 }
351 break;
352
353 case '^':
354 // ^ ^=
355 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
356 break;
357
358 case '.':
359 // . Number
360 Advance();
361 if (IsDecimalDigit(c0_)) {
362 token = ScanNumber(true);
363 } else {
364 token = Token::PERIOD;
365 }
366 break;
367
368 case ':':
369 token = Select(Token::COLON);
370 break;
371
372 case ';':
373 token = Select(Token::SEMICOLON);
374 break;
375
376 case ',':
377 token = Select(Token::COMMA);
378 break;
379
380 case '(':
381 token = Select(Token::LPAREN);
382 break;
383
384 case ')':
385 token = Select(Token::RPAREN);
386 break;
387
388 case '[':
389 token = Select(Token::LBRACK);
390 break;
391
392 case ']':
393 token = Select(Token::RBRACK);
394 break;
395
396 case '{':
397 token = Select(Token::LBRACE);
398 break;
399
400 case '}':
401 token = Select(Token::RBRACE);
402 break;
403
404 case '?':
405 token = Select(Token::CONDITIONAL);
406 break;
407
408 case '~':
409 token = Select(Token::BIT_NOT);
410 break;
411
412 default:
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000413 if (unicode_cache_->IsIdentifierStart(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000414 token = ScanIdentifierOrKeyword();
415 } else if (IsDecimalDigit(c0_)) {
416 token = ScanNumber(false);
417 } else if (SkipWhiteSpace()) {
418 token = Token::WHITESPACE;
419 } else if (c0_ < 0) {
420 token = Token::EOS;
421 } else {
422 token = Select(Token::ILLEGAL);
423 }
424 break;
425 }
426
427 // Continue scanning for tokens as long as we're just skipping
428 // whitespace.
429 } while (token == Token::WHITESPACE);
430
431 next_.location.end_pos = source_pos();
432 next_.token = token;
433}
434
435
436void JavaScriptScanner::SeekForward(int pos) {
ager@chromium.org5f0c45f2010-12-17 08:51:21 +0000437 // After this call, we will have the token at the given position as
438 // the "next" token. The "current" token will be invalid.
439 if (pos == next_.location.beg_pos) return;
440 int current_pos = source_pos();
441 ASSERT_EQ(next_.location.end_pos, current_pos);
442 // Positions inside the lookahead token aren't supported.
443 ASSERT(pos >= current_pos);
444 if (pos != current_pos) {
445 source_->SeekForward(pos - source_->pos());
446 Advance();
447 // This function is only called to seek to the location
448 // of the end of a function (at the "}" token). It doesn't matter
449 // whether there was a line terminator in the part we skip.
450 has_line_terminator_before_next_ = false;
451 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000452 Scan();
453}
454
455
456void JavaScriptScanner::ScanEscape() {
457 uc32 c = c0_;
458 Advance();
459
460 // Skip escaped newlines.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000461 if (unicode_cache_->IsLineTerminator(c)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000462 // Allow CR+LF newlines in multiline string literals.
463 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
464 // Allow LF+CR newlines in multiline string literals.
465 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
466 return;
467 }
468
469 switch (c) {
470 case '\'': // fall through
471 case '"' : // fall through
472 case '\\': break;
473 case 'b' : c = '\b'; break;
474 case 'f' : c = '\f'; break;
475 case 'n' : c = '\n'; break;
476 case 'r' : c = '\r'; break;
477 case 't' : c = '\t'; break;
478 case 'u' : c = ScanHexEscape(c, 4); break;
479 case 'v' : c = '\v'; break;
480 case 'x' : c = ScanHexEscape(c, 2); break;
481 case '0' : // fall through
482 case '1' : // fall through
483 case '2' : // fall through
484 case '3' : // fall through
485 case '4' : // fall through
486 case '5' : // fall through
487 case '6' : // fall through
488 case '7' : c = ScanOctalEscape(c, 2); break;
489 }
490
491 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
492 // should be illegal, but they are commonly handled
493 // as non-escaped characters by JS VMs.
494 AddLiteralChar(c);
495}
496
497
lrn@chromium.org1c092762011-05-09 09:42:16 +0000498// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
499// ECMA-262. Other JS VMs support them.
500uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {
501 uc32 x = c - '0';
502 int i = 0;
503 for (; i < length; i++) {
504 int d = c0_ - '0';
505 if (d < 0 || d > 7) break;
506 int nx = x * 8 + d;
507 if (nx >= 256) break;
508 x = nx;
509 Advance();
510 }
511 // Anything except '\0' is an octal escape sequence, illegal in strict mode.
512 // Remember the position of octal escape sequences so that an error
513 // can be reported later (in strict mode).
514 // We don't report the error immediately, because the octal escape can
515 // occur before the "use strict" directive.
516 if (c != '0' || i > 0) {
517 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
518 }
519 return x;
520}
521
522
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000523Token::Value JavaScriptScanner::ScanString() {
524 uc32 quote = c0_;
525 Advance(); // consume quote
526
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000527 LiteralScope literal(this);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000528 while (c0_ != quote && c0_ >= 0
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000529 && !unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000530 uc32 c = c0_;
531 Advance();
532 if (c == '\\') {
533 if (c0_ < 0) return Token::ILLEGAL;
534 ScanEscape();
535 } else {
536 AddLiteralChar(c);
537 }
538 }
539 if (c0_ != quote) return Token::ILLEGAL;
540 literal.Complete();
541
542 Advance(); // consume quote
543 return Token::STRING;
544}
545
546
547void JavaScriptScanner::ScanDecimalDigits() {
548 while (IsDecimalDigit(c0_))
549 AddLiteralCharAdvance();
550}
551
552
553Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
554 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
555
556 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
557
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000558 LiteralScope literal(this);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000559 if (seen_period) {
560 // we have already seen a decimal point of the float
561 AddLiteralChar('.');
562 ScanDecimalDigits(); // we know we have at least one digit
563
564 } else {
565 // if the first character is '0' we must check for octals and hex
566 if (c0_ == '0') {
lrn@chromium.org1c092762011-05-09 09:42:16 +0000567 int start_pos = source_pos(); // For reporting octal positions.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000568 AddLiteralCharAdvance();
569
570 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
571 if (c0_ == 'x' || c0_ == 'X') {
572 // hex number
573 kind = HEX;
574 AddLiteralCharAdvance();
575 if (!IsHexDigit(c0_)) {
576 // we must have at least one hex digit after 'x'/'X'
577 return Token::ILLEGAL;
578 }
579 while (IsHexDigit(c0_)) {
580 AddLiteralCharAdvance();
581 }
582 } else if ('0' <= c0_ && c0_ <= '7') {
583 // (possible) octal number
584 kind = OCTAL;
585 while (true) {
586 if (c0_ == '8' || c0_ == '9') {
587 kind = DECIMAL;
588 break;
589 }
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000590 if (c0_ < '0' || '7' < c0_) {
591 // Octal literal finished.
lrn@chromium.org1c092762011-05-09 09:42:16 +0000592 octal_pos_ = Location(start_pos, source_pos());
ager@chromium.org0ee099b2011-01-25 14:06:47 +0000593 break;
594 }
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000595 AddLiteralCharAdvance();
596 }
597 }
598 }
599
600 // Parse decimal digits and allow trailing fractional part.
601 if (kind == DECIMAL) {
602 ScanDecimalDigits(); // optional
603 if (c0_ == '.') {
604 AddLiteralCharAdvance();
605 ScanDecimalDigits(); // optional
606 }
607 }
608 }
609
610 // scan exponent, if any
611 if (c0_ == 'e' || c0_ == 'E') {
612 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
613 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
614 // scan exponent
615 AddLiteralCharAdvance();
616 if (c0_ == '+' || c0_ == '-')
617 AddLiteralCharAdvance();
618 if (!IsDecimalDigit(c0_)) {
619 // we must have at least one decimal digit after 'e'/'E'
620 return Token::ILLEGAL;
621 }
622 ScanDecimalDigits();
623 }
624
625 // The source character immediately following a numeric literal must
626 // not be an identifier start or a decimal digit; see ECMA-262
627 // section 7.8.3, page 17 (note that we read only one decimal digit
628 // if the value is 0).
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000629 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000630 return Token::ILLEGAL;
631
632 literal.Complete();
633
634 return Token::NUMBER;
635}
636
637
638uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
639 Advance();
640 if (c0_ != 'u') return unibrow::Utf8::kBadChar;
641 Advance();
642 uc32 c = ScanHexEscape('u', 4);
643 // We do not allow a unicode escape sequence to start another
644 // unicode escape sequence.
645 if (c == '\\') return unibrow::Utf8::kBadChar;
646 return c;
647}
648
649
650Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000651 ASSERT(unicode_cache_->IsIdentifierStart(c0_));
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000652 LiteralScope literal(this);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000653 KeywordMatcher keyword_match;
654 // Scan identifier start character.
655 if (c0_ == '\\') {
656 uc32 c = ScanIdentifierUnicodeEscape();
657 // Only allow legal identifier start characters.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000658 if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000659 AddLiteralChar(c);
660 return ScanIdentifierSuffix(&literal);
661 }
662
663 uc32 first_char = c0_;
664 Advance();
665 AddLiteralChar(first_char);
666 if (!keyword_match.AddChar(first_char)) {
667 return ScanIdentifierSuffix(&literal);
668 }
669
670 // Scan the rest of the identifier characters.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000671 while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000672 if (c0_ != '\\') {
673 uc32 next_char = c0_;
674 Advance();
675 AddLiteralChar(next_char);
676 if (keyword_match.AddChar(next_char)) continue;
677 }
678 // Fallthrough if no loner able to complete keyword.
679 return ScanIdentifierSuffix(&literal);
680 }
681 literal.Complete();
682
683 return keyword_match.token();
684}
685
686
687Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
688 // Scan the rest of the identifier characters.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000689 while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000690 if (c0_ == '\\') {
691 uc32 c = ScanIdentifierUnicodeEscape();
692 // Only allow legal identifier part characters.
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000693 if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000694 AddLiteralChar(c);
695 } else {
696 AddLiteralChar(c0_);
697 Advance();
698 }
699 }
700 literal->Complete();
701
702 return Token::IDENTIFIER;
703}
704
705
706bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
707 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
708 bool in_character_class = false;
709
710 // Previous token is either '/' or '/=', in the second case, the
711 // pattern starts at =.
712 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
713 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
714
715 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
716 // the scanner should pass uninterpreted bodies to the RegExp
717 // constructor.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000718 LiteralScope literal(this);
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000719 if (seen_equal)
720 AddLiteralChar('=');
721
722 while (c0_ != '/' || in_character_class) {
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000723 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
kmillikin@chromium.orgd2c22f02011-01-10 08:15:37 +0000724 if (c0_ == '\\') { // Escape sequence.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000725 AddLiteralCharAdvance();
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000726 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000727 AddLiteralCharAdvance();
kmillikin@chromium.orgd2c22f02011-01-10 08:15:37 +0000728 // If the escape allows more characters, i.e., \x??, \u????, or \c?,
729 // only "safe" characters are allowed (letters, digits, underscore),
730 // otherwise the escape isn't valid and the invalid character has
731 // its normal meaning. I.e., we can just continue scanning without
732 // worrying whether the following characters are part of the escape
733 // or not, since any '/', '\\' or '[' is guaranteed to not be part
734 // of the escape sequence.
lrn@chromium.org1c092762011-05-09 09:42:16 +0000735
736 // TODO(896): At some point, parse RegExps more throughly to capture
737 // octal esacpes in strict mode.
kmillikin@chromium.orgd2c22f02011-01-10 08:15:37 +0000738 } else { // Unescaped character.
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000739 if (c0_ == '[') in_character_class = true;
740 if (c0_ == ']') in_character_class = false;
741 AddLiteralCharAdvance();
742 }
743 }
744 Advance(); // consume '/'
745
746 literal.Complete();
747
748 return true;
749}
750
751
752bool JavaScriptScanner::ScanRegExpFlags() {
753 // Scan regular expression flags.
fschneider@chromium.org9e3e0b62011-01-03 10:16:46 +0000754 LiteralScope literal(this);
ager@chromium.orga9aa5fa2011-04-13 08:46:07 +0000755 while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000756 if (c0_ == '\\') {
757 uc32 c = ScanIdentifierUnicodeEscape();
758 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
759 // We allow any escaped character, unlike the restriction on
760 // IdentifierPart when it is used to build an IdentifierName.
761 AddLiteralChar(c);
762 continue;
763 }
764 }
765 AddLiteralCharAdvance();
766 }
767 literal.Complete();
768
769 next_.location.end_pos = source_pos() - 1;
770 return true;
771}
772
// ----------------------------------------------------------------------------
// Keyword Matcher
775
776KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
777 { "break", KEYWORD_PREFIX, Token::BREAK },
778 { NULL, C, Token::ILLEGAL },
779 { NULL, D, Token::ILLEGAL },
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000780 { NULL, E, Token::ILLEGAL },
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000781 { NULL, F, Token::ILLEGAL },
782 { NULL, UNMATCHABLE, Token::ILLEGAL },
783 { NULL, UNMATCHABLE, Token::ILLEGAL },
784 { NULL, I, Token::ILLEGAL },
785 { NULL, UNMATCHABLE, Token::ILLEGAL },
786 { NULL, UNMATCHABLE, Token::ILLEGAL },
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000787 { "let", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000788 { NULL, UNMATCHABLE, Token::ILLEGAL },
789 { NULL, N, Token::ILLEGAL },
790 { NULL, UNMATCHABLE, Token::ILLEGAL },
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000791 { NULL, P, Token::ILLEGAL },
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000792 { NULL, UNMATCHABLE, Token::ILLEGAL },
793 { "return", KEYWORD_PREFIX, Token::RETURN },
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000794 { NULL, S, Token::ILLEGAL },
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000795 { NULL, T, Token::ILLEGAL },
796 { NULL, UNMATCHABLE, Token::ILLEGAL },
797 { NULL, V, Token::ILLEGAL },
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000798 { NULL, W, Token::ILLEGAL },
799 { NULL, UNMATCHABLE, Token::ILLEGAL },
800 { "yield", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000801};
802
803
804void KeywordMatcher::Step(unibrow::uchar input) {
805 switch (state_) {
806 case INITIAL: {
807 // matching the first character is the only state with significant fanout.
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000808 // Match only lower-case letters in range 'b'..'y'.
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000809 unsigned int offset = input - kFirstCharRangeMin;
810 if (offset < kFirstCharRangeLength) {
811 state_ = first_states_[offset].state;
812 if (state_ == KEYWORD_PREFIX) {
813 keyword_ = first_states_[offset].keyword;
814 counter_ = 1;
815 keyword_token_ = first_states_[offset].token;
816 }
817 return;
818 }
819 break;
820 }
821 case KEYWORD_PREFIX:
822 if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
823 counter_++;
824 if (keyword_[counter_] == '\0') {
825 state_ = KEYWORD_MATCHED;
826 token_ = keyword_token_;
827 }
828 return;
829 }
830 break;
831 case KEYWORD_MATCHED:
832 token_ = Token::IDENTIFIER;
833 break;
834 case C:
835 if (MatchState(input, 'a', CA)) return;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000836 if (MatchKeywordStart(input, "class", 1,
837 Token::FUTURE_RESERVED_WORD)) return;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000838 if (MatchState(input, 'o', CO)) return;
839 break;
840 case CA:
841 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
842 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
843 break;
844 case CO:
845 if (MatchState(input, 'n', CON)) return;
846 break;
847 case CON:
848 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
849 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
850 break;
851 case D:
852 if (MatchState(input, 'e', DE)) return;
853 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
854 break;
855 case DE:
856 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
857 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
858 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
859 break;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000860 case E:
861 if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
862 if (MatchKeywordStart(input, "enum", 1,
863 Token::FUTURE_RESERVED_WORD)) return;
864 if (MatchState(input, 'x', EX)) return;
865 break;
866 case EX:
867 if (MatchKeywordStart(input, "export", 2,
868 Token::FUTURE_RESERVED_WORD)) return;
869 if (MatchKeywordStart(input, "extends", 2,
870 Token::FUTURE_RESERVED_WORD)) return;
871 break;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000872 case F:
873 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
874 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
875 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
876 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
877 break;
878 case I:
879 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000880 if (MatchState(input, 'm', IM)) return;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000881 if (MatchKeyword(input, 'n', IN, Token::IN)) return;
882 break;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000883 case IM:
884 if (MatchState(input, 'p', IMP)) return;
885 break;
886 case IMP:
887 if (MatchKeywordStart(input, "implements", 3,
888 Token::FUTURE_RESERVED_WORD )) return;
889 if (MatchKeywordStart(input, "import", 3,
890 Token::FUTURE_RESERVED_WORD)) return;
891 break;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000892 case IN:
893 token_ = Token::IDENTIFIER;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000894 if (MatchKeywordStart(input, "interface", 2,
895 Token::FUTURE_RESERVED_WORD)) return;
vegorov@chromium.org21b5e952010-11-23 10:24:40 +0000896 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000897 break;
898 case N:
899 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
900 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
901 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
902 break;
ricow@chromium.org83aa5492011-02-07 12:42:56 +0000903 case P:
904 if (MatchKeywordStart(input, "package", 1,
905 Token::FUTURE_RESERVED_WORD)) return;
906 if (MatchState(input, 'r', PR)) return;
907 if (MatchKeywordStart(input, "public", 1,
908 Token::FUTURE_RESERVED_WORD)) return;
909 break;
910 case PR:
911 if (MatchKeywordStart(input, "private", 2,
912 Token::FUTURE_RESERVED_WORD)) return;
913 if (MatchKeywordStart(input, "protected", 2,
914 Token::FUTURE_RESERVED_WORD)) return;
915 break;
916 case S:
917 if (MatchKeywordStart(input, "static", 1,
918 Token::FUTURE_RESERVED_WORD)) return;
919 if (MatchKeywordStart(input, "super", 1,
920 Token::FUTURE_RESERVED_WORD)) return;
921 if (MatchKeywordStart(input, "switch", 1,
922 Token::SWITCH)) return;
923 break;
whesse@chromium.orgf0ac72d2010-11-08 12:47:26 +0000924 case T:
925 if (MatchState(input, 'h', TH)) return;
926 if (MatchState(input, 'r', TR)) return;
927 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
928 break;
929 case TH:
930 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
931 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
932 break;
933 case TR:
934 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
935 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
936 break;
937 case V:
938 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
939 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
940 break;
941 case W:
942 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
943 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
944 break;
945 case UNMATCHABLE:
946 break;
947 }
948 // On fallthrough, it's a failure.
949 state_ = UNMATCHABLE;
950}
951
952} } // namespace v8::internal