Blame - src/scanner.cc - fp2-dev/platform/external/chromium_org/v8

blob: 24a6d4be9cbf79ea11d6f7d65d5a5decdf45146a [file] [log] [blame]

ager@chromium.org	9258b6b	2008-09-11 09:11:10 +0000	[diff] [blame]	1	// Copyright 2006-2008 the V8 project authors. All rights reserved.
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#include "v8.h"
				29
				30	#include "ast.h"
				31	#include "scanner.h"
				32
kasperl@chromium.org	71affb5	2009-05-26 05:44:31 +0000	[diff] [blame]	33	namespace v8 {
				34	namespace internal {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	35
				36	// ----------------------------------------------------------------------------
				37	// Character predicates
				38
				39
				40	unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;  // Queried via get() to test whether a character may start an identifier.
				41	unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;  // Queried via get() to test whether a character may continue an identifier.
				42	unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;  // Queried via get() to detect line terminators.
				43	unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;  // Queried via get() in SkipWhiteSpace(); also matches line terminators.
				44
				45
				46	StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;  // Definition of the static member; NOTE(review): presumably a decoder shared by all scanners - confirm in scanner.h.
				47
				48
				49	// ----------------------------------------------------------------------------
				50	// UTF8Buffer
				51
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	52	UTF8Buffer::UTF8Buffer() {
				53	static const int kInitialCapacity = 1 * KB;
				54	data_ = NewArray<char>(kInitialCapacity);
				55	limit_ = ComputeLimit(data_, kInitialCapacity);
				56	Reset();
				57	ASSERT(Capacity() == kInitialCapacity && pos() == 0);
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	58	}
				59
				60
				61	UTF8Buffer::~UTF8Buffer() {
				62	DeleteArray(data_);
				63	}
				64
				65
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	66	void UTF8Buffer::AddCharSlow(uc32 c) {
				67	static const int kCapacityGrowthLimit = 1 * MB;
				68	if (cursor_ > limit_) {
				69	int old_capacity = Capacity();
				70	int old_position = pos();
				71	int new_capacity =
				72	Min(old_capacity * 2, old_capacity + kCapacityGrowthLimit);
				73	char* new_data = NewArray<char>(new_capacity);
				74	memcpy(new_data, data_, old_position);
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	75	DeleteArray(data_);
				76	data_ = new_data;
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	77	cursor_ = new_data + old_position;
				78	limit_ = ComputeLimit(new_data, new_capacity);
				79	ASSERT(Capacity() == new_capacity && pos() == old_position);
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	80	}
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	81	if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
				82	*cursor_++ = c; // Common case: 7-bit ASCII.
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	83	} else {
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	84	cursor_ += unibrow::Utf8::Encode(cursor_, c);
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	85	}
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	86	ASSERT(pos() <= Capacity());
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	87	}
				88
				89
				90	// ----------------------------------------------------------------------------
				91	// UTF16Buffer
				92
				93
				94	UTF16Buffer::UTF16Buffer()
				95	: pos_(0),
				96	pushback_buffer_(0),
				97	last_(0),
				98	stream_(NULL) { }
				99
				100
				101	void UTF16Buffer::Initialize(Handle<String> data,
				102	unibrow::CharacterStream* input) {
				103	data_ = data;
				104	pos_ = 0;
				105	stream_ = input;
				106	}
				107
				108
				109	Handle<String> UTF16Buffer::SubString(int start, int end) {
				110	return internal::SubString(data_, start, end);
				111	}
				112
				113
				114	void UTF16Buffer::PushBack(uc32 ch) {
				115	pushback_buffer()->Add(last_);
				116	last_ = ch;
				117	pos_--;
				118	}
				119
				120
				121	uc32 UTF16Buffer::Advance() {
				122	// NOTE: It is of importance to Persian / Farsi resources that we do
				123	// not strip format control characters in the scanner; see
				124	//
				125	// https://bugzilla.mozilla.org/show_bug.cgi?id=274152
				126	//
				127	// So, even though ECMA-262, section 7.1, page 11, dictates that we
				128	// must remove Unicode format-control characters, we do not. This is
				129	// in line with how IE and SpiderMonkey handles it.
				130	if (!pushback_buffer()->is_empty()) {
				131	pos_++;
				132	return last_ = pushback_buffer()->RemoveLast();
				133	} else if (stream_->has_more()) {
				134	pos_++;
				135	uc32 next = stream_->GetNext();
				136	return last_ = next;
				137	} else {
				138	// note: currently the following increment is necessary to avoid a
				139	// test-parser problem!
				140	pos_++;
				141	return last_ = static_cast<uc32>(-1);
				142	}
				143	}
				144
				145
				146	void UTF16Buffer::SeekForward(int pos) {
				147	pos_ = pos;
				148	ASSERT(pushback_buffer()->is_empty());
				149	stream_->Seek(pos);
				150	}
				151
				152
				153	// ----------------------------------------------------------------------------
				154	// Scanner
				155
				156	Scanner::Scanner(bool pre) : stack_overflow_(false), is_pre_parsing_(pre) {
				157	Token::Initialize();
				158	}
				159
				160
				161	void Scanner::Init(Handle<String> source, unibrow::CharacterStream* stream,
				162	int position) {
				163	// Initialize the source buffer.
				164	source_.Initialize(source, stream);
				165	position_ = position;
				166
				167	// Reset literals buffer
				168	literals_.Reset();
				169
				170	// Set c0_ (one character ahead)
				171	ASSERT(kCharacterLookaheadBufferSize == 1);
				172	Advance();
				173
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	174	// Skip initial whitespace allowing HTML comment ends just like
				175	// after a newline and scan first token.
				176	has_line_terminator_before_next_ = true;
				177	SkipWhiteSpace();
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	178	Scan();
				179	}
				180
				181
				182	Handle<String> Scanner::SubString(int start, int end) {
				183	return source_.SubString(start - position_, end - position_);
				184	}
				185
				186
				187	Token::Value Scanner::Next() {
				188	// BUG 1215673: Find a thread safe way to set a stack limit in
				189	// pre-parse mode. Otherwise, we cannot safely pre-parse from other
				190	// threads.
				191	current_ = next_;
				192	// Check for stack-overflow before returning any tokens.
				193	StackLimitCheck check;
				194	if (check.HasOverflowed()) {
				195	stack_overflow_ = true;
				196	next_.token = Token::ILLEGAL;
				197	} else {
				198	Scan();
				199	}
				200	return current_.token;
				201	}
				202
				203
				204	void Scanner::StartLiteral() {
				205	next_.literal_pos = literals_.pos();
				206	}
				207
				208
				209	void Scanner::AddChar(uc32 c) {
				210	literals_.AddChar(c);
				211	}
				212
				213
				214	void Scanner::TerminateLiteral() {
				215	next_.literal_end = literals_.pos();
				216	AddChar(0);
				217	}
				218
				219
				220	void Scanner::AddCharAdvance() {
				221	AddChar(c0_);
				222	Advance();
				223	}
				224
				225
				226	void Scanner::Advance() {
				227	c0_ = source_.Advance();
				228	}
				229
				230
				231	void Scanner::PushBack(uc32 ch) {
				232	source_.PushBack(ch);
				233	c0_ = ch;
				234	}
				235
				236
ager@chromium.org	3bf7b91	2008-11-17 09:09:45 +0000	[diff] [blame]	237	static inline bool IsByteOrderMark(uc32 c) {
				238	// The Unicode value U+FFFE is guaranteed never to be assigned as a
				239	// Unicode character; this implies that in a Unicode context the
				240	// 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
				241	// character expressed in little-endian byte order (since it could
				242	// not be a U+FFFE character expressed in big-endian byte
				243	// order). Nevertheless, we check for it to be compatible with
				244	// Spidermonkey.
				245	return c == 0xFEFF \|\| c == 0xFFFE;
				246	}
				247
				248
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	249	bool Scanner::SkipWhiteSpace() {
				250	int start_position = source_pos();
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	251
				252	while (true) {
ager@chromium.org	3bf7b91	2008-11-17 09:09:45 +0000	[diff] [blame]	253	// We treat byte-order marks (BOMs) as whitespace for better
				254	// compatibility with Spidermonkey and other JavaScript engines.
				255	while (kIsWhiteSpace.get(c0_) \|\| IsByteOrderMark(c0_)) {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	256	// IsWhiteSpace() includes line terminators!
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	257	if (kIsLineTerminator.get(c0_)) {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	258	// Ignore line terminators, but remember them. This is necessary
				259	// for automatic semicolon insertion.
				260	has_line_terminator_before_next_ = true;
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	261	}
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	262	Advance();
				263	}
				264
				265	// If there is an HTML comment end '-->' at the beginning of a
				266	// line (with only whitespace in front of it), we treat the rest
				267	// of the line as a comment. This is in line with the way
				268	// SpiderMonkey handles it.
				269	if (c0_ == '-' && has_line_terminator_before_next_) {
				270	Advance();
				271	if (c0_ == '-') {
				272	Advance();
				273	if (c0_ == '>') {
				274	// Treat the rest of the line as a comment.
				275	SkipSingleLineComment();
				276	// Continue skipping white space after the comment.
				277	continue;
				278	}
				279	PushBack('-'); // undo Advance()
				280	}
				281	PushBack('-'); // undo Advance()
				282	}
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	283	// Return whether or not we skipped any characters.
				284	return source_pos() != start_position;
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	285	}
				286	}
				287
				288
				289	Token::Value Scanner::SkipSingleLineComment() {
				290	Advance();
				291
				292	// The line terminator at the end of the line is not considered
				293	// to be part of the single-line comment; it is recognized
				294	// separately by the lexical grammar and becomes part of the
				295	// stream of input elements for the syntactic grammar (see
				296	// ECMA-262, section 7.4, page 12).
				297	while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
				298	Advance();
				299	}
				300
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	301	return Token::WHITESPACE;
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	302	}
				303
				304
				305	Token::Value Scanner::SkipMultiLineComment() {
				306	ASSERT(c0_ == '*');
				307	Advance();
				308
				309	while (c0_ >= 0) {
				310	char ch = c0_;
				311	Advance();
				312	// If we have reached the end of the multi-line comment, we
				313	// consume the '/' and insert a whitespace. This way all
				314	// multi-line comments are treated as whitespace - even the ones
				315	// containing line terminators. This contradicts ECMA-262, section
				316	// 7.4, page 12, that says that multi-line comments containing
				317	// line terminators should be treated as a line terminator, but it
				318	// matches the behaviour of SpiderMonkey and KJS.
				319	if (ch == '*' && c0_ == '/') {
				320	c0_ = ' ';
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	321	return Token::WHITESPACE;
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	322	}
				323	}
				324
				325	// Unterminated multi-line comment.
				326	return Token::ILLEGAL;
				327	}
				328
				329
				330	Token::Value Scanner::ScanHtmlComment() {
				331	// Check for <!-- comments.
				332	ASSERT(c0_ == '!');
				333	Advance();
				334	if (c0_ == '-') {
				335	Advance();
				336	if (c0_ == '-') return SkipSingleLineComment();
				337	PushBack('-'); // undo Advance()
				338	}
				339	PushBack('!'); // undo Advance()
				340	ASSERT(c0_ == '!');
				341	return Token::LT;
				342	}
				343
				344
				345	void Scanner::Scan() {
				346	Token::Value token;
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	347	has_line_terminator_before_next_ = false;
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	348	do {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	349	// Remember the position of the next token
				350	next_.location.beg_pos = source_pos();
				351
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	352	switch (c0_) {
				353	case ' ':
				354	case '\t':
				355	Advance();
				356	token = Token::WHITESPACE;
				357	break;
				358
				359	case '\n':
				360	Advance();
				361	has_line_terminator_before_next_ = true;
				362	token = Token::WHITESPACE;
				363	break;
				364
				365	case '"': case '\'':
				366	token = ScanString();
				367	break;
				368
				369	case '<':
				370	// < <= << <<= <!--
				371	Advance();
				372	if (c0_ == '=') {
				373	token = Select(Token::LTE);
				374	} else if (c0_ == '<') {
				375	token = Select('=', Token::ASSIGN_SHL, Token::SHL);
				376	} else if (c0_ == '!') {
				377	token = ScanHtmlComment();
				378	} else {
				379	token = Token::LT;
				380	}
				381	break;
				382
				383	case '>':
				384	// > >= >> >>= >>> >>>=
				385	Advance();
				386	if (c0_ == '=') {
				387	token = Select(Token::GTE);
				388	} else if (c0_ == '>') {
				389	// >> >>= >>> >>>=
				390	Advance();
				391	if (c0_ == '=') {
				392	token = Select(Token::ASSIGN_SAR);
				393	} else if (c0_ == '>') {
				394	token = Select('=', Token::ASSIGN_SHR, Token::SHR);
				395	} else {
				396	token = Token::SAR;
				397	}
				398	} else {
				399	token = Token::GT;
				400	}
				401	break;
				402
				403	case '=':
				404	// = == ===
				405	Advance();
				406	if (c0_ == '=') {
				407	token = Select('=', Token::EQ_STRICT, Token::EQ);
				408	} else {
				409	token = Token::ASSIGN;
				410	}
				411	break;
				412
				413	case '!':
				414	// ! != !==
				415	Advance();
				416	if (c0_ == '=') {
				417	token = Select('=', Token::NE_STRICT, Token::NE);
				418	} else {
				419	token = Token::NOT;
				420	}
				421	break;
				422
				423	case '+':
				424	// + ++ +=
				425	Advance();
				426	if (c0_ == '+') {
				427	token = Select(Token::INC);
				428	} else if (c0_ == '=') {
				429	token = Select(Token::ASSIGN_ADD);
				430	} else {
				431	token = Token::ADD;
				432	}
				433	break;
				434
				435	case '-':
				436	// - -- --> -=
				437	Advance();
				438	if (c0_ == '-') {
				439	Advance();
				440	if (c0_ == '>' && has_line_terminator_before_next_) {
				441	// For compatibility with SpiderMonkey, we skip lines that
				442	// start with an HTML comment end '-->'.
				443	token = SkipSingleLineComment();
				444	} else {
				445	token = Token::DEC;
				446	}
				447	} else if (c0_ == '=') {
				448	token = Select(Token::ASSIGN_SUB);
				449	} else {
				450	token = Token::SUB;
				451	}
				452	break;
				453
				454	case '*':
				455	// * *=
				456	token = Select('=', Token::ASSIGN_MUL, Token::MUL);
				457	break;
				458
				459	case '%':
				460	// % %=
				461	token = Select('=', Token::ASSIGN_MOD, Token::MOD);
				462	break;
				463
				464	case '/':
				465	// / // /* /=
				466	Advance();
				467	if (c0_ == '/') {
				468	token = SkipSingleLineComment();
				469	} else if (c0_ == '*') {
				470	token = SkipMultiLineComment();
				471	} else if (c0_ == '=') {
				472	token = Select(Token::ASSIGN_DIV);
				473	} else {
				474	token = Token::DIV;
				475	}
				476	break;
				477
				478	case '&':
				479	// & && &=
				480	Advance();
				481	if (c0_ == '&') {
				482	token = Select(Token::AND);
				483	} else if (c0_ == '=') {
				484	token = Select(Token::ASSIGN_BIT_AND);
				485	} else {
				486	token = Token::BIT_AND;
				487	}
				488	break;
				489
				490	case '\|':
				491	// \| \|\| \|=
				492	Advance();
				493	if (c0_ == '\|') {
				494	token = Select(Token::OR);
				495	} else if (c0_ == '=') {
				496	token = Select(Token::ASSIGN_BIT_OR);
				497	} else {
				498	token = Token::BIT_OR;
				499	}
				500	break;
				501
				502	case '^':
				503	// ^ ^=
				504	token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
				505	break;
				506
				507	case '.':
				508	// . Number
				509	Advance();
				510	if (IsDecimalDigit(c0_)) {
				511	token = ScanNumber(true);
				512	} else {
				513	token = Token::PERIOD;
				514	}
				515	break;
				516
				517	case ':':
				518	token = Select(Token::COLON);
				519	break;
				520
				521	case ';':
				522	token = Select(Token::SEMICOLON);
				523	break;
				524
				525	case ',':
				526	token = Select(Token::COMMA);
				527	break;
				528
				529	case '(':
				530	token = Select(Token::LPAREN);
				531	break;
				532
				533	case ')':
				534	token = Select(Token::RPAREN);
				535	break;
				536
				537	case '[':
				538	token = Select(Token::LBRACK);
				539	break;
				540
				541	case ']':
				542	token = Select(Token::RBRACK);
				543	break;
				544
				545	case '{':
				546	token = Select(Token::LBRACE);
				547	break;
				548
				549	case '}':
				550	token = Select(Token::RBRACE);
				551	break;
				552
				553	case '?':
				554	token = Select(Token::CONDITIONAL);
				555	break;
				556
				557	case '~':
				558	token = Select(Token::BIT_NOT);
				559	break;
				560
				561	default:
				562	if (kIsIdentifierStart.get(c0_)) {
				563	token = ScanIdentifier();
				564	} else if (IsDecimalDigit(c0_)) {
				565	token = ScanNumber(false);
				566	} else if (SkipWhiteSpace()) {
				567	token = Token::WHITESPACE;
				568	} else if (c0_ < 0) {
				569	token = Token::EOS;
				570	} else {
				571	token = Select(Token::ILLEGAL);
				572	}
				573	break;
				574	}
				575
				576	// Continue scanning for tokens as long as we're just skipping
				577	// whitespace.
				578	} while (token == Token::WHITESPACE);
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	579
				580	next_.location.end_pos = source_pos();
				581	next_.token = token;
				582	}
				583
				584
				585	void Scanner::SeekForward(int pos) {
				586	source_.SeekForward(pos - 1);
				587	Advance();
				588	Scan();
				589	}
				590
				591
				592	uc32 Scanner::ScanHexEscape(uc32 c, int length) {
				593	ASSERT(length <= 4); // prevent overflow
				594
				595	uc32 digits[4];
				596	uc32 x = 0;
				597	for (int i = 0; i < length; i++) {
				598	digits[i] = c0_;
				599	int d = HexValue(c0_);
				600	if (d < 0) {
				601	// According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
				602	// should be illegal, but other JS VMs just return the
				603	// non-escaped version of the original character.
				604
				605	// Push back digits read, except the last one (in c0_).
				606	for (int j = i-1; j >= 0; j--) {
				607	PushBack(digits[j]);
				608	}
ager@chromium.org	6f10e41	2009-02-13 10:11:16 +0000	[diff] [blame]	609	// Notice: No handling of error - treat it as "\u"->"u".
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	610	return c;
				611	}
				612	x = x * 16 + d;
				613	Advance();
				614	}
				615
				616	return x;
				617	}
				618
				619
				620	// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
				621	// ECMA-262. Other JS VMs support them.
				622	uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
				623	uc32 x = c - '0';
				624	for (int i = 0; i < length; i++) {
				625	int d = c0_ - '0';
				626	if (d < 0 \|\| d > 7) break;
				627	int nx = x * 8 + d;
				628	if (nx >= 256) break;
				629	x = nx;
				630	Advance();
				631	}
				632	return x;
				633	}
				634
				635
				636	void Scanner::ScanEscape() {
				637	uc32 c = c0_;
				638	Advance();
				639
				640	// Skip escaped newlines.
				641	if (kIsLineTerminator.get(c)) {
				642	// Allow CR+LF newlines in multiline string literals.
				643	if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
				644	// Allow LF+CR newlines in multiline string literals.
				645	if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
				646	return;
				647	}
				648
				649	switch (c) {
				650	case '\'': // fall through
				651	case '"' : // fall through
				652	case '\\': break;
				653	case 'b' : c = '\b'; break;
				654	case 'f' : c = '\f'; break;
				655	case 'n' : c = '\n'; break;
				656	case 'r' : c = '\r'; break;
				657	case 't' : c = '\t'; break;
				658	case 'u' : c = ScanHexEscape(c, 4); break;
				659	case 'v' : c = '\v'; break;
				660	case 'x' : c = ScanHexEscape(c, 2); break;
				661	case '0' : // fall through
				662	case '1' : // fall through
				663	case '2' : // fall through
				664	case '3' : // fall through
				665	case '4' : // fall through
				666	case '5' : // fall through
				667	case '6' : // fall through
				668	case '7' : c = ScanOctalEscape(c, 2); break;
				669	}
				670
				671	// According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
				672	// should be illegal, but they are commonly handled
				673	// as non-escaped characters by JS VMs.
				674	AddChar(c);
				675	}
				676
				677
				678	Token::Value Scanner::ScanString() {
				679	uc32 quote = c0_;
				680	Advance(); // consume quote
				681
				682	StartLiteral();
				683	while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
				684	uc32 c = c0_;
				685	Advance();
				686	if (c == '\\') {
				687	if (c0_ < 0) return Token::ILLEGAL;
				688	ScanEscape();
				689	} else {
				690	AddChar(c);
				691	}
				692	}
				693	if (c0_ != quote) {
				694	return Token::ILLEGAL;
				695	}
				696	TerminateLiteral();
				697
				698	Advance(); // consume quote
				699	return Token::STRING;
				700	}
				701
				702
				703	Token::Value Scanner::Select(Token::Value tok) {
				704	Advance();
				705	return tok;
				706	}
				707
				708
				709	Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
				710	Advance();
				711	if (c0_ == next) {
				712	Advance();
				713	return then;
				714	} else {
				715	return else_;
				716	}
				717	}
				718
				719
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	720	// Returns true if any decimal digits were scanned, returns false otherwise.
				721	void Scanner::ScanDecimalDigits() {
				722	while (IsDecimalDigit(c0_))
				723	AddCharAdvance();
				724	}
				725
				726
				727	Token::Value Scanner::ScanNumber(bool seen_period) {
				728	ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction
				729
				730	enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
				731
				732	StartLiteral();
				733	if (seen_period) {
				734	// we have already seen a decimal point of the float
				735	AddChar('.');
				736	ScanDecimalDigits(); // we know we have at least one digit
				737
				738	} else {
				739	// if the first character is '0' we must check for octals and hex
				740	if (c0_ == '0') {
				741	AddCharAdvance();
				742
				743	// either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
				744	if (c0_ == 'x' \|\| c0_ == 'X') {
				745	// hex number
				746	kind = HEX;
				747	AddCharAdvance();
				748	if (!IsHexDigit(c0_))
				749	// we must have at least one hex digit after 'x'/'X'
				750	return Token::ILLEGAL;
				751	while (IsHexDigit(c0_))
				752	AddCharAdvance();
				753
				754	} else if ('0' <= c0_ && c0_ <= '7') {
				755	// (possible) octal number
				756	kind = OCTAL;
				757	while (true) {
				758	if (c0_ == '8' \|\| c0_ == '9') {
				759	kind = DECIMAL;
				760	break;
				761	}
				762	if (c0_ < '0' \|\| '7' < c0_) break;
				763	AddCharAdvance();
				764	}
				765	}
				766	}
				767
				768	// Parse decimal digits and allow trailing fractional part.
				769	if (kind == DECIMAL) {
				770	ScanDecimalDigits(); // optional
				771	if (c0_ == '.') {
				772	AddCharAdvance();
				773	ScanDecimalDigits(); // optional
				774	}
				775	}
				776	}
				777
				778	// scan exponent, if any
				779	if (c0_ == 'e' \|\| c0_ == 'E') {
				780	ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number
				781	if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed
				782	// scan exponent
				783	AddCharAdvance();
				784	if (c0_ == '+' \|\| c0_ == '-')
				785	AddCharAdvance();
				786	if (!IsDecimalDigit(c0_))
				787	// we must have at least one decimal digit after 'e'/'E'
				788	return Token::ILLEGAL;
				789	ScanDecimalDigits();
				790	}
				791	TerminateLiteral();
				792
				793	// The source character immediately following a numeric literal must
				794	// not be an identifier start or a decimal digit; see ECMA-262
				795	// section 7.8.3, page 17 (note that we read only one decimal digit
				796	// if the value is 0).
				797	if (IsDecimalDigit(c0_) \|\| kIsIdentifierStart.get(c0_))
				798	return Token::ILLEGAL;
				799
				800	return Token::NUMBER;
				801	}
				802
				803
				804	uc32 Scanner::ScanIdentifierUnicodeEscape() {
				805	Advance();
				806	if (c0_ != 'u') return unibrow::Utf8::kBadChar;
				807	Advance();
				808	uc32 c = ScanHexEscape('u', 4);
				809	// We do not allow a unicode escape sequence to start another
				810	// unicode escape sequence.
				811	if (c == '\\') return unibrow::Utf8::kBadChar;
				812	return c;
				813	}
				814
				815
				816	Token::Value Scanner::ScanIdentifier() {
				817	ASSERT(kIsIdentifierStart.get(c0_));
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	818	bool has_escapes = false;
				819
				820	StartLiteral();
				821	// Scan identifier start character.
				822	if (c0_ == '\\') {
				823	has_escapes = true;
				824	uc32 c = ScanIdentifierUnicodeEscape();
				825	// Only allow legal identifier start characters.
				826	if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
				827	AddChar(c);
				828	} else {
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	829	AddChar(c0_);
				830	Advance();
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	831	}
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	832
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	833	// Scan the rest of the identifier characters.
				834	while (kIsIdentifierPart.get(c0_)) {
				835	if (c0_ == '\\') {
				836	has_escapes = true;
				837	uc32 c = ScanIdentifierUnicodeEscape();
				838	// Only allow legal identifier part characters.
				839	if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
				840	AddChar(c);
				841	} else {
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	842	AddChar(c0_);
				843	Advance();
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	844	}
				845	}
				846	TerminateLiteral();
				847
				848	// We don't have any 1-letter keywords (this is probably a common case).
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	849	if ((next_.literal_end - next_.literal_pos) == 1) {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	850	return Token::IDENTIFIER;
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	851	}
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	852
				853	// If the identifier contains unicode escapes, it must not be
				854	// resolved to a keyword.
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	855	if (has_escapes) {
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	856	return Token::IDENTIFIER;
kasperl@chromium.org	b3284ad	2009-05-18 06:12:45 +0000	[diff] [blame]	857	}
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	858
				859	return Token::Lookup(&literals_.data()[next_.literal_pos]);
				860	}
				861
				862
				863
				864	bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
ager@chromium.org	3291210	2009-01-16 10:38:43 +0000	[diff] [blame]	865	// Checks whether the buffer contains an identifier (no escape).
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	866	if (!buffer->has_more()) return false;
				867	if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
				868	while (buffer->has_more()) {
				869	if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
				870	}
				871	return true;
				872	}
				873
				874
				875	bool Scanner::ScanRegExpPattern(bool seen_equal) {
				876	// Scan: ('/' \| '/=') RegularExpressionBody '/' RegularExpressionFlags
				877	bool in_character_class = false;
				878
				879	// Previous token is either '/' or '/=', in the second case, the
				880	// pattern starts at =.
				881	next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
				882	next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
				883
				884	// Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
				885	// the scanner should pass uninterpreted bodies to the RegExp
				886	// constructor.
				887	StartLiteral();
				888	if (seen_equal)
				889	AddChar('=');
				890
				891	while (c0_ != '/' \|\| in_character_class) {
				892	if (kIsLineTerminator.get(c0_) \|\| c0_ < 0)
				893	return false;
				894	if (c0_ == '\\') { // escaped character
				895	AddCharAdvance();
				896	if (kIsLineTerminator.get(c0_) \|\| c0_ < 0)
				897	return false;
				898	AddCharAdvance();
				899	} else { // unescaped character
				900	if (c0_ == '[')
				901	in_character_class = true;
				902	if (c0_ == ']')
				903	in_character_class = false;
				904	AddCharAdvance();
				905	}
				906	}
				907	Advance(); // consume '/'
				908
				909	TerminateLiteral();
				910
				911	return true;
				912	}
				913
				914	bool Scanner::ScanRegExpFlags() {
				915	// Scan regular expression flags.
				916	StartLiteral();
ager@chromium.org	6f10e41	2009-02-13 10:11:16 +0000	[diff] [blame]	917	while (kIsIdentifierPart.get(c0_)) {
				918	if (c0_ == '\\') {
				919	uc32 c = ScanIdentifierUnicodeEscape();
				920	if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
				921	// We allow any escaped character, unlike the restriction on
				922	// IdentifierPart when it is used to build an IdentifierName.
				923	AddChar(c);
				924	continue;
				925	}
				926	}
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	927	AddCharAdvance();
ager@chromium.org	6f10e41	2009-02-13 10:11:16 +0000	[diff] [blame]	928	}
christian.plesner.hansen	43d26ec	2008-07-03 15:10:15 +0000	[diff] [blame]	929	TerminateLiteral();
				930
				931	next_.location.end_pos = source_pos() - 1;
				932	return true;
				933	}
				934
				935	} } // namespace v8::internal