Blame - src/scanner-base.cc - fp2-dev/platform/external/chromium_org/v8

blob: e15ef416c9c4a8201bfc134db0a73f7928e565b4 [file] [log] [blame]

ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	1	// Copyright 2011 the V8 project authors. All rights reserved.
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	// Features shared by parsing and pre-parsing scanners.
				29
erik.corry@gmail.com	4a6c327	2010-11-18 12:04:40 +0000	[diff] [blame]	30	#include "../include/v8stdint.h"
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	31	#include "scanner-base.h"
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	32	#include "char-predicates-inl.h"
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	33
				34	namespace v8 {
				35	namespace internal {
				36
				37	// ----------------------------------------------------------------------------
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	38	// Scanner
				39
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	40	Scanner::Scanner(UnicodeCache* unicode_cache)
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	41	: unicode_cache_(unicode_cache) { }
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	42
				43
				44	uc32 Scanner::ScanHexEscape(uc32 c, int length) {
				45	ASSERT(length <= 4); // prevent overflow
				46
				47	uc32 digits[4];
				48	uc32 x = 0;
				49	for (int i = 0; i < length; i++) {
				50	digits[i] = c0_;
				51	int d = HexValue(c0_);
				52	if (d < 0) {
				53	// According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
				54	// should be illegal, but other JS VMs just return the
				55	// non-escaped version of the original character.
				56
				57	// Push back digits read, except the last one (in c0_).
				58	for (int j = i-1; j >= 0; j--) {
				59	PushBack(digits[j]);
				60	}
				61	// Notice: No handling of error - treat it as "\u"->"u".
				62	return c;
				63	}
				64	x = x * 16 + d;
				65	Advance();
				66	}
				67
				68	return x;
				69	}
				70
				71
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	72
				73	// ----------------------------------------------------------------------------
				74	// JavaScriptScanner
				75
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	76	JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	77	: Scanner(scanner_contants), octal_pos_(Location::invalid()) { }
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	78
				79
				80	Token::Value JavaScriptScanner::Next() {
				81	current_ = next_;
				82	has_line_terminator_before_next_ = false;
				83	Scan();
				84	return current_.token;
				85	}
				86
				87
				88	static inline bool IsByteOrderMark(uc32 c) {
				89	// The Unicode value U+FFFE is guaranteed never to be assigned as a
				90	// Unicode character; this implies that in a Unicode context the
				91	// 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
				92	// character expressed in little-endian byte order (since it could
				93	// not be a U+FFFE character expressed in big-endian byte
				94	// order). Nevertheless, we check for it to be compatible with
				95	// Spidermonkey.
				96	return c == 0xFEFF \|\| c == 0xFFFE;
				97	}
				98
				99
				100	bool JavaScriptScanner::SkipWhiteSpace() {
				101	int start_position = source_pos();
				102
				103	while (true) {
				104	// We treat byte-order marks (BOMs) as whitespace for better
				105	// compatibility with Spidermonkey and other JavaScript engines.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	106	while (unicode_cache_->IsWhiteSpace(c0_) \|\| IsByteOrderMark(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	107	// IsWhiteSpace() includes line terminators!
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	108	if (unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	109	// Ignore line terminators, but remember them. This is necessary
				110	// for automatic semicolon insertion.
				111	has_line_terminator_before_next_ = true;
				112	}
				113	Advance();
				114	}
				115
				116	// If there is an HTML comment end '-->' at the beginning of a
				117	// line (with only whitespace in front of it), we treat the rest
				118	// of the line as a comment. This is in line with the way
				119	// SpiderMonkey handles it.
				120	if (c0_ == '-' && has_line_terminator_before_next_) {
				121	Advance();
				122	if (c0_ == '-') {
				123	Advance();
				124	if (c0_ == '>') {
				125	// Treat the rest of the line as a comment.
				126	SkipSingleLineComment();
				127	// Continue skipping white space after the comment.
				128	continue;
				129	}
				130	PushBack('-'); // undo Advance()
				131	}
				132	PushBack('-'); // undo Advance()
				133	}
				134	// Return whether or not we skipped any characters.
				135	return source_pos() != start_position;
				136	}
				137	}
				138
				139
				140	Token::Value JavaScriptScanner::SkipSingleLineComment() {
				141	Advance();
				142
				143	// The line terminator at the end of the line is not considered
				144	// to be part of the single-line comment; it is recognized
				145	// separately by the lexical grammar and becomes part of the
				146	// stream of input elements for the syntactic grammar (see
				147	// ECMA-262, section 7.4, page 12).
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	148	while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	149	Advance();
				150	}
				151
				152	return Token::WHITESPACE;
				153	}
				154
				155
				156	Token::Value JavaScriptScanner::SkipMultiLineComment() {
				157	ASSERT(c0_ == '*');
				158	Advance();
				159
				160	while (c0_ >= 0) {
				161	char ch = c0_;
				162	Advance();
				163	// If we have reached the end of the multi-line comment, we
				164	// consume the '/' and insert a whitespace. This way all
				165	// multi-line comments are treated as whitespace - even the ones
				166	// containing line terminators. This contradicts ECMA-262, section
				167	// 7.4, page 12, that says that multi-line comments containing
				168	// line terminators should be treated as a line terminator, but it
				169	// matches the behaviour of SpiderMonkey and KJS.
				170	if (ch == '*' && c0_ == '/') {
				171	c0_ = ' ';
				172	return Token::WHITESPACE;
				173	}
				174	}
				175
				176	// Unterminated multi-line comment.
				177	return Token::ILLEGAL;
				178	}
				179
				180
				181	Token::Value JavaScriptScanner::ScanHtmlComment() {
				182	// Check for <!-- comments.
				183	ASSERT(c0_ == '!');
				184	Advance();
				185	if (c0_ == '-') {
				186	Advance();
				187	if (c0_ == '-') return SkipSingleLineComment();
				188	PushBack('-'); // undo Advance()
				189	}
				190	PushBack('!'); // undo Advance()
				191	ASSERT(c0_ == '!');
				192	return Token::LT;
				193	}
				194
				195
				196	void JavaScriptScanner::Scan() {
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	197	next_.literal_chars = NULL;
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	198	Token::Value token;
				199	do {
				200	// Remember the position of the next token
				201	next_.location.beg_pos = source_pos();
				202
				203	switch (c0_) {
				204	case ' ':
				205	case '\t':
				206	Advance();
				207	token = Token::WHITESPACE;
				208	break;
				209
				210	case '\n':
				211	Advance();
				212	has_line_terminator_before_next_ = true;
				213	token = Token::WHITESPACE;
				214	break;
				215
				216	case '"': case '\'':
				217	token = ScanString();
				218	break;
				219
				220	case '<':
				221	// < <= << <<= <!--
				222	Advance();
				223	if (c0_ == '=') {
				224	token = Select(Token::LTE);
				225	} else if (c0_ == '<') {
				226	token = Select('=', Token::ASSIGN_SHL, Token::SHL);
				227	} else if (c0_ == '!') {
				228	token = ScanHtmlComment();
				229	} else {
				230	token = Token::LT;
				231	}
				232	break;
				233
				234	case '>':
				235	// > >= >> >>= >>> >>>=
				236	Advance();
				237	if (c0_ == '=') {
				238	token = Select(Token::GTE);
				239	} else if (c0_ == '>') {
				240	// >> >>= >>> >>>=
				241	Advance();
				242	if (c0_ == '=') {
				243	token = Select(Token::ASSIGN_SAR);
				244	} else if (c0_ == '>') {
				245	token = Select('=', Token::ASSIGN_SHR, Token::SHR);
				246	} else {
				247	token = Token::SAR;
				248	}
				249	} else {
				250	token = Token::GT;
				251	}
				252	break;
				253
				254	case '=':
				255	// = == ===
				256	Advance();
				257	if (c0_ == '=') {
				258	token = Select('=', Token::EQ_STRICT, Token::EQ);
				259	} else {
				260	token = Token::ASSIGN;
				261	}
				262	break;
				263
				264	case '!':
				265	// ! != !==
				266	Advance();
				267	if (c0_ == '=') {
				268	token = Select('=', Token::NE_STRICT, Token::NE);
				269	} else {
				270	token = Token::NOT;
				271	}
				272	break;
				273
				274	case '+':
				275	// + ++ +=
				276	Advance();
				277	if (c0_ == '+') {
				278	token = Select(Token::INC);
				279	} else if (c0_ == '=') {
				280	token = Select(Token::ASSIGN_ADD);
				281	} else {
				282	token = Token::ADD;
				283	}
				284	break;
				285
				286	case '-':
				287	// - -- --> -=
				288	Advance();
				289	if (c0_ == '-') {
				290	Advance();
				291	if (c0_ == '>' && has_line_terminator_before_next_) {
				292	// For compatibility with SpiderMonkey, we skip lines that
				293	// start with an HTML comment end '-->'.
				294	token = SkipSingleLineComment();
				295	} else {
				296	token = Token::DEC;
				297	}
				298	} else if (c0_ == '=') {
				299	token = Select(Token::ASSIGN_SUB);
				300	} else {
				301	token = Token::SUB;
				302	}
				303	break;
				304
				305	case '*':
				306	// * *=
				307	token = Select('=', Token::ASSIGN_MUL, Token::MUL);
				308	break;
				309
				310	case '%':
				311	// % %=
				312	token = Select('=', Token::ASSIGN_MOD, Token::MOD);
				313	break;
				314
				315	case '/':
				316	// / // /* /=
				317	Advance();
				318	if (c0_ == '/') {
				319	token = SkipSingleLineComment();
				320	} else if (c0_ == '*') {
				321	token = SkipMultiLineComment();
				322	} else if (c0_ == '=') {
				323	token = Select(Token::ASSIGN_DIV);
				324	} else {
				325	token = Token::DIV;
				326	}
				327	break;
				328
				329	case '&':
				330	// & && &=
				331	Advance();
				332	if (c0_ == '&') {
				333	token = Select(Token::AND);
				334	} else if (c0_ == '=') {
				335	token = Select(Token::ASSIGN_BIT_AND);
				336	} else {
				337	token = Token::BIT_AND;
				338	}
				339	break;
				340
				341	case '\|':
				342	// \| \|\| \|=
				343	Advance();
				344	if (c0_ == '\|') {
				345	token = Select(Token::OR);
				346	} else if (c0_ == '=') {
				347	token = Select(Token::ASSIGN_BIT_OR);
				348	} else {
				349	token = Token::BIT_OR;
				350	}
				351	break;
				352
				353	case '^':
				354	// ^ ^=
				355	token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
				356	break;
				357
				358	case '.':
				359	// . Number
				360	Advance();
				361	if (IsDecimalDigit(c0_)) {
				362	token = ScanNumber(true);
				363	} else {
				364	token = Token::PERIOD;
				365	}
				366	break;
				367
				368	case ':':
				369	token = Select(Token::COLON);
				370	break;
				371
				372	case ';':
				373	token = Select(Token::SEMICOLON);
				374	break;
				375
				376	case ',':
				377	token = Select(Token::COMMA);
				378	break;
				379
				380	case '(':
				381	token = Select(Token::LPAREN);
				382	break;
				383
				384	case ')':
				385	token = Select(Token::RPAREN);
				386	break;
				387
				388	case '[':
				389	token = Select(Token::LBRACK);
				390	break;
				391
				392	case ']':
				393	token = Select(Token::RBRACK);
				394	break;
				395
				396	case '{':
				397	token = Select(Token::LBRACE);
				398	break;
				399
				400	case '}':
				401	token = Select(Token::RBRACE);
				402	break;
				403
				404	case '?':
				405	token = Select(Token::CONDITIONAL);
				406	break;
				407
				408	case '~':
				409	token = Select(Token::BIT_NOT);
				410	break;
				411
				412	default:
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	413	if (unicode_cache_->IsIdentifierStart(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	414	token = ScanIdentifierOrKeyword();
				415	} else if (IsDecimalDigit(c0_)) {
				416	token = ScanNumber(false);
				417	} else if (SkipWhiteSpace()) {
				418	token = Token::WHITESPACE;
				419	} else if (c0_ < 0) {
				420	token = Token::EOS;
				421	} else {
				422	token = Select(Token::ILLEGAL);
				423	}
				424	break;
				425	}
				426
				427	// Continue scanning for tokens as long as we're just skipping
				428	// whitespace.
				429	} while (token == Token::WHITESPACE);
				430
				431	next_.location.end_pos = source_pos();
				432	next_.token = token;
				433	}
				434
				435
				436	void JavaScriptScanner::SeekForward(int pos) {
ager@chromium.org	5f0c45f	2010-12-17 08:51:21 +0000	[diff] [blame]	437	// After this call, we will have the token at the given position as
				438	// the "next" token. The "current" token will be invalid.
				439	if (pos == next_.location.beg_pos) return;
				440	int current_pos = source_pos();
				441	ASSERT_EQ(next_.location.end_pos, current_pos);
				442	// Positions inside the lookahead token aren't supported.
				443	ASSERT(pos >= current_pos);
				444	if (pos != current_pos) {
				445	source_->SeekForward(pos - source_->pos());
				446	Advance();
				447	// This function is only called to seek to the location
				448	// of the end of a function (at the "}" token). It doesn't matter
				449	// whether there was a line terminator in the part we skip.
				450	has_line_terminator_before_next_ = false;
				451	}
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	452	Scan();
				453	}
				454
				455
				456	void JavaScriptScanner::ScanEscape() {
				457	uc32 c = c0_;
				458	Advance();
				459
				460	// Skip escaped newlines.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	461	if (unicode_cache_->IsLineTerminator(c)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	462	// Allow CR+LF newlines in multiline string literals.
				463	if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
				464	// Allow LF+CR newlines in multiline string literals.
				465	if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
				466	return;
				467	}
				468
				469	switch (c) {
				470	case '\'': // fall through
				471	case '"' : // fall through
				472	case '\\': break;
				473	case 'b' : c = '\b'; break;
				474	case 'f' : c = '\f'; break;
				475	case 'n' : c = '\n'; break;
				476	case 'r' : c = '\r'; break;
				477	case 't' : c = '\t'; break;
				478	case 'u' : c = ScanHexEscape(c, 4); break;
				479	case 'v' : c = '\v'; break;
				480	case 'x' : c = ScanHexEscape(c, 2); break;
				481	case '0' : // fall through
				482	case '1' : // fall through
				483	case '2' : // fall through
				484	case '3' : // fall through
				485	case '4' : // fall through
				486	case '5' : // fall through
				487	case '6' : // fall through
				488	case '7' : c = ScanOctalEscape(c, 2); break;
				489	}
				490
				491	// According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
				492	// should be illegal, but they are commonly handled
				493	// as non-escaped characters by JS VMs.
				494	AddLiteralChar(c);
				495	}
				496
				497
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	498	// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
				499	// ECMA-262. Other JS VMs support them.
				500	uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {
				501	uc32 x = c - '0';
				502	int i = 0;
				503	for (; i < length; i++) {
				504	int d = c0_ - '0';
				505	if (d < 0 \|\| d > 7) break;
				506	int nx = x * 8 + d;
				507	if (nx >= 256) break;
				508	x = nx;
				509	Advance();
				510	}
				511	// Anything except '\0' is an octal escape sequence, illegal in strict mode.
				512	// Remember the position of octal escape sequences so that an error
				513	// can be reported later (in strict mode).
				514	// We don't report the error immediately, because the octal escape can
				515	// occur before the "use strict" directive.
				516	if (c != '0' \|\| i > 0) {
				517	octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
				518	}
				519	return x;
				520	}
				521
				522
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	523	Token::Value JavaScriptScanner::ScanString() {
				524	uc32 quote = c0_;
				525	Advance(); // consume quote
				526
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	527	LiteralScope literal(this);
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	528	while (c0_ != quote && c0_ >= 0
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	529	&& !unicode_cache_->IsLineTerminator(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	530	uc32 c = c0_;
				531	Advance();
				532	if (c == '\\') {
				533	if (c0_ < 0) return Token::ILLEGAL;
				534	ScanEscape();
				535	} else {
				536	AddLiteralChar(c);
				537	}
				538	}
				539	if (c0_ != quote) return Token::ILLEGAL;
				540	literal.Complete();
				541
				542	Advance(); // consume quote
				543	return Token::STRING;
				544	}
				545
				546
				547	void JavaScriptScanner::ScanDecimalDigits() {
				548	while (IsDecimalDigit(c0_))
				549	AddLiteralCharAdvance();
				550	}
				551
				552
				553	Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
				554	ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
				555
				556	enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
				557
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	558	LiteralScope literal(this);
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	559	if (seen_period) {
				560	// we have already seen a decimal point of the float
				561	AddLiteralChar('.');
				562	ScanDecimalDigits(); // we know we have at least one digit
				563
				564	} else {
				565	// if the first character is '0' we must check for octals and hex
				566	if (c0_ == '0') {
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	567	int start_pos = source_pos(); // For reporting octal positions.
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	568	AddLiteralCharAdvance();
				569
				570	// either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
				571	if (c0_ == 'x' \|\| c0_ == 'X') {
				572	// hex number
				573	kind = HEX;
				574	AddLiteralCharAdvance();
				575	if (!IsHexDigit(c0_)) {
				576	// we must have at least one hex digit after 'x'/'X'
				577	return Token::ILLEGAL;
				578	}
				579	while (IsHexDigit(c0_)) {
				580	AddLiteralCharAdvance();
				581	}
				582	} else if ('0' <= c0_ && c0_ <= '7') {
				583	// (possible) octal number
				584	kind = OCTAL;
				585	while (true) {
				586	if (c0_ == '8' \|\| c0_ == '9') {
				587	kind = DECIMAL;
				588	break;
				589	}
ager@chromium.org	0ee099b	2011-01-25 14:06:47 +0000	[diff] [blame]	590	if (c0_ < '0' \|\| '7' < c0_) {
				591	// Octal literal finished.
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	592	octal_pos_ = Location(start_pos, source_pos());
ager@chromium.org	0ee099b	2011-01-25 14:06:47 +0000	[diff] [blame]	593	break;
				594	}
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	595	AddLiteralCharAdvance();
				596	}
				597	}
				598	}
				599
				600	// Parse decimal digits and allow trailing fractional part.
				601	if (kind == DECIMAL) {
				602	ScanDecimalDigits(); // optional
				603	if (c0_ == '.') {
				604	AddLiteralCharAdvance();
				605	ScanDecimalDigits(); // optional
				606	}
				607	}
				608	}
				609
				610	// scan exponent, if any
				611	if (c0_ == 'e' \|\| c0_ == 'E') {
				612	ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
				613	if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
				614	// scan exponent
				615	AddLiteralCharAdvance();
				616	if (c0_ == '+' \|\| c0_ == '-')
				617	AddLiteralCharAdvance();
				618	if (!IsDecimalDigit(c0_)) {
				619	// we must have at least one decimal digit after 'e'/'E'
				620	return Token::ILLEGAL;
				621	}
				622	ScanDecimalDigits();
				623	}
				624
				625	// The source character immediately following a numeric literal must
				626	// not be an identifier start or a decimal digit; see ECMA-262
				627	// section 7.8.3, page 17 (note that we read only one decimal digit
				628	// if the value is 0).
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	629	if (IsDecimalDigit(c0_) \|\| unicode_cache_->IsIdentifierStart(c0_))
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	630	return Token::ILLEGAL;
				631
				632	literal.Complete();
				633
				634	return Token::NUMBER;
				635	}
				636
				637
				638	uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
				639	Advance();
				640	if (c0_ != 'u') return unibrow::Utf8::kBadChar;
				641	Advance();
				642	uc32 c = ScanHexEscape('u', 4);
				643	// We do not allow a unicode escape sequence to start another
				644	// unicode escape sequence.
				645	if (c == '\\') return unibrow::Utf8::kBadChar;
				646	return c;
				647	}
				648
				649
				650	Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	651	ASSERT(unicode_cache_->IsIdentifierStart(c0_));
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	652	LiteralScope literal(this);
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	653	KeywordMatcher keyword_match;
				654	// Scan identifier start character.
				655	if (c0_ == '\\') {
				656	uc32 c = ScanIdentifierUnicodeEscape();
				657	// Only allow legal identifier start characters.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	658	if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	659	AddLiteralChar(c);
				660	return ScanIdentifierSuffix(&literal);
				661	}
				662
				663	uc32 first_char = c0_;
				664	Advance();
				665	AddLiteralChar(first_char);
				666	if (!keyword_match.AddChar(first_char)) {
				667	return ScanIdentifierSuffix(&literal);
				668	}
				669
				670	// Scan the rest of the identifier characters.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	671	while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	672	if (c0_ != '\\') {
				673	uc32 next_char = c0_;
				674	Advance();
				675	AddLiteralChar(next_char);
				676	if (keyword_match.AddChar(next_char)) continue;
				677	}
				678	// Fallthrough if no loner able to complete keyword.
				679	return ScanIdentifierSuffix(&literal);
				680	}
				681	literal.Complete();
				682
				683	return keyword_match.token();
				684	}
				685
				686
				687	Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
				688	// Scan the rest of the identifier characters.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	689	while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	690	if (c0_ == '\\') {
				691	uc32 c = ScanIdentifierUnicodeEscape();
				692	// Only allow legal identifier part characters.
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	693	if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	694	AddLiteralChar(c);
				695	} else {
				696	AddLiteralChar(c0_);
				697	Advance();
				698	}
				699	}
				700	literal->Complete();
				701
				702	return Token::IDENTIFIER;
				703	}
				704
				705
				706	bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
				707	// Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags
				708	bool in_character_class = false;
				709
				710	// Previous token is either '/' or '/=', in the second case, the
				711	// pattern starts at =.
				712	next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
				713	next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
				714
				715	// Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
				716	// the scanner should pass uninterpreted bodies to the RegExp
				717	// constructor.
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	718	LiteralScope literal(this);
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	719	if (seen_equal)
				720	AddLiteralChar('=');
				721
				722	while (c0_ != '/' \|\| in_character_class) {
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	723	if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;
kmillikin@chromium.org	d2c22f0	2011-01-10 08:15:37 +0000	[diff] [blame]	724	if (c0_ == '\\') { // Escape sequence.
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	725	AddLiteralCharAdvance();
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	726	if (unicode_cache_->IsLineTerminator(c0_) \|\| c0_ < 0) return false;
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	727	AddLiteralCharAdvance();
kmillikin@chromium.org	d2c22f0	2011-01-10 08:15:37 +0000	[diff] [blame]	728	// If the escape allows more characters, i.e., \x??, \u????, or \c?,
				729	// only "safe" characters are allowed (letters, digits, underscore),
				730	// otherwise the escape isn't valid and the invalid character has
				731	// its normal meaning. I.e., we can just continue scanning without
				732	// worrying whether the following characters are part of the escape
				733	// or not, since any '/', '\\' or '[' is guaranteed to not be part
				734	// of the escape sequence.
lrn@chromium.org	1c09276	2011-05-09 09:42:16 +0000	[diff] [blame]	735
				736	// TODO(896): At some point, parse RegExps more throughly to capture
				737	// octal esacpes in strict mode.
kmillikin@chromium.org	d2c22f0	2011-01-10 08:15:37 +0000	[diff] [blame]	738	} else { // Unescaped character.
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	739	if (c0_ == '[') in_character_class = true;
				740	if (c0_ == ']') in_character_class = false;
				741	AddLiteralCharAdvance();
				742	}
				743	}
				744	Advance(); // consume '/'
				745
				746	literal.Complete();
				747
				748	return true;
				749	}
				750
				751
				752	bool JavaScriptScanner::ScanRegExpFlags() {
				753	// Scan regular expression flags.
fschneider@chromium.org	9e3e0b6	2011-01-03 10:16:46 +0000	[diff] [blame]	754	LiteralScope literal(this);
ager@chromium.org	a9aa5fa	2011-04-13 08:46:07 +0000	[diff] [blame]	755	while (unicode_cache_->IsIdentifierPart(c0_)) {
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	756	if (c0_ == '\\') {
				757	uc32 c = ScanIdentifierUnicodeEscape();
				758	if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
				759	// We allow any escaped character, unlike the restriction on
				760	// IdentifierPart when it is used to build an IdentifierName.
				761	AddLiteralChar(c);
				762	continue;
				763	}
				764	}
				765	AddLiteralCharAdvance();
				766	}
				767	literal.Complete();
				768
				769	next_.location.end_pos = source_pos() - 1;
				770	return true;
				771	}
				772
				773	// ----------------------------------------------------------------------------
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	774	// Keyword Matcher
				775
// Initial transition table of the keyword matcher, with one entry per
// lower-case letter from 'b' to 'y' (24 entries); KeywordMatcher::Step()
// indexes it by (input - kFirstCharRangeMin). Each entry holds a keyword
// string (or NULL), the state to enter, and a token. Step() reads the
// keyword and token only when the state is KEYWORD_PREFIX.
KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
  { "break", KEYWORD_PREFIX, Token::BREAK },  // 'b'
  { NULL, C, Token::ILLEGAL },  // 'c'
  { NULL, D, Token::ILLEGAL },  // 'd'
  { NULL, E, Token::ILLEGAL },  // 'e'
  { NULL, F, Token::ILLEGAL },  // 'f'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'g'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'h'
  { NULL, I, Token::ILLEGAL },  // 'i'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'j'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'k'
  { "let", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },  // 'l'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'm'
  { NULL, N, Token::ILLEGAL },  // 'n'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'o'
  { NULL, P, Token::ILLEGAL },  // 'p'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'q'
  { "return", KEYWORD_PREFIX, Token::RETURN },  // 'r'
  { NULL, S, Token::ILLEGAL },  // 's'
  { NULL, T, Token::ILLEGAL },  // 't'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'u'
  { NULL, V, Token::ILLEGAL },  // 'v'
  { NULL, W, Token::ILLEGAL },  // 'w'
  { NULL, UNMATCHABLE, Token::ILLEGAL },  // 'x'
  { "yield", KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }  // 'y'
};
				802
				803
				804	void KeywordMatcher::Step(unibrow::uchar input) {
				805	switch (state_) {
				806	case INITIAL: {
				807	// matching the first character is the only state with significant fanout.
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	808	// Match only lower-case letters in range 'b'..'y'.
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	809	unsigned int offset = input - kFirstCharRangeMin;
				810	if (offset < kFirstCharRangeLength) {
				811	state_ = first_states_[offset].state;
				812	if (state_ == KEYWORD_PREFIX) {
				813	keyword_ = first_states_[offset].keyword;
				814	counter_ = 1;
				815	keyword_token_ = first_states_[offset].token;
				816	}
				817	return;
				818	}
				819	break;
				820	}
				821	case KEYWORD_PREFIX:
				822	if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
				823	counter_++;
				824	if (keyword_[counter_] == '\0') {
				825	state_ = KEYWORD_MATCHED;
				826	token_ = keyword_token_;
				827	}
				828	return;
				829	}
				830	break;
				831	case KEYWORD_MATCHED:
				832	token_ = Token::IDENTIFIER;
				833	break;
				834	case C:
				835	if (MatchState(input, 'a', CA)) return;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	836	if (MatchKeywordStart(input, "class", 1,
				837	Token::FUTURE_RESERVED_WORD)) return;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	838	if (MatchState(input, 'o', CO)) return;
				839	break;
				840	case CA:
				841	if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
				842	if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
				843	break;
				844	case CO:
				845	if (MatchState(input, 'n', CON)) return;
				846	break;
				847	case CON:
				848	if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
				849	if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
				850	break;
				851	case D:
				852	if (MatchState(input, 'e', DE)) return;
				853	if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
				854	break;
				855	case DE:
				856	if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
				857	if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
				858	if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
				859	break;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	860	case E:
				861	if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
				862	if (MatchKeywordStart(input, "enum", 1,
				863	Token::FUTURE_RESERVED_WORD)) return;
				864	if (MatchState(input, 'x', EX)) return;
				865	break;
				866	case EX:
				867	if (MatchKeywordStart(input, "export", 2,
				868	Token::FUTURE_RESERVED_WORD)) return;
				869	if (MatchKeywordStart(input, "extends", 2,
				870	Token::FUTURE_RESERVED_WORD)) return;
				871	break;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	872	case F:
				873	if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
				874	if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
				875	if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
				876	if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
				877	break;
				878	case I:
				879	if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	880	if (MatchState(input, 'm', IM)) return;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	881	if (MatchKeyword(input, 'n', IN, Token::IN)) return;
				882	break;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	883	case IM:
				884	if (MatchState(input, 'p', IMP)) return;
				885	break;
				886	case IMP:
				887	if (MatchKeywordStart(input, "implements", 3,
				888	Token::FUTURE_RESERVED_WORD )) return;
				889	if (MatchKeywordStart(input, "import", 3,
				890	Token::FUTURE_RESERVED_WORD)) return;
				891	break;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	892	case IN:
				893	token_ = Token::IDENTIFIER;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	894	if (MatchKeywordStart(input, "interface", 2,
				895	Token::FUTURE_RESERVED_WORD)) return;
vegorov@chromium.org	21b5e95	2010-11-23 10:24:40 +0000	[diff] [blame]	896	if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	897	break;
				898	case N:
				899	if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
				900	if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
				901	if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
				902	break;
ricow@chromium.org	83aa549	2011-02-07 12:42:56 +0000	[diff] [blame]	903	case P:
				904	if (MatchKeywordStart(input, "package", 1,
				905	Token::FUTURE_RESERVED_WORD)) return;
				906	if (MatchState(input, 'r', PR)) return;
				907	if (MatchKeywordStart(input, "public", 1,
				908	Token::FUTURE_RESERVED_WORD)) return;
				909	break;
				910	case PR:
				911	if (MatchKeywordStart(input, "private", 2,
				912	Token::FUTURE_RESERVED_WORD)) return;
				913	if (MatchKeywordStart(input, "protected", 2,
				914	Token::FUTURE_RESERVED_WORD)) return;
				915	break;
				916	case S:
				917	if (MatchKeywordStart(input, "static", 1,
				918	Token::FUTURE_RESERVED_WORD)) return;
				919	if (MatchKeywordStart(input, "super", 1,
				920	Token::FUTURE_RESERVED_WORD)) return;
				921	if (MatchKeywordStart(input, "switch", 1,
				922	Token::SWITCH)) return;
				923	break;
whesse@chromium.org	f0ac72d	2010-11-08 12:47:26 +0000	[diff] [blame]	924	case T:
				925	if (MatchState(input, 'h', TH)) return;
				926	if (MatchState(input, 'r', TR)) return;
				927	if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
				928	break;
				929	case TH:
				930	if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
				931	if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
				932	break;
				933	case TR:
				934	if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
				935	if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
				936	break;
				937	case V:
				938	if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
				939	if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
				940	break;
				941	case W:
				942	if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
				943	if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
				944	break;
				945	case UNMATCHABLE:
				946	break;
				947	}
				948	// On fallthrough, it's a failure.
				949	state_ = UNMATCHABLE;
				950	}
				951
				952	} } // namespace v8::internal