Blame - src/scanner.h - fp2-dev/platform/external/v8

blob: f0035c0eb3e8b633f5c5761fe8eb41c5aa6f7e87 [file] [log] [blame]

Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	1	// Copyright 2006-2008 the V8 project authors. All rights reserved.
				2	// Redistribution and use in source and binary forms, with or without
				3	// modification, are permitted provided that the following conditions are
				4	// met:
				5	//
				6	// * Redistributions of source code must retain the above copyright
				7	// notice, this list of conditions and the following disclaimer.
				8	// * Redistributions in binary form must reproduce the above
				9	// copyright notice, this list of conditions and the following
				10	// disclaimer in the documentation and/or other materials provided
				11	// with the distribution.
				12	// * Neither the name of Google Inc. nor the names of its
				13	// contributors may be used to endorse or promote products derived
				14	// from this software without specific prior written permission.
				15	//
				16	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				19	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				20	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				22	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				23	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				24	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				25	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				26	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27
				28	#ifndef V8_SCANNER_H_
				29	#define V8_SCANNER_H_
				30
				31	#include "token.h"
				32	#include "char-predicates-inl.h"
				33
				34	namespace v8 {
				35	namespace internal {
				36
				37
				38	class UTF8Buffer {
				39	public:
				40	UTF8Buffer();
				41	~UTF8Buffer();
				42
				43	void AddChar(uc32 c) {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	44	ASSERT_NOT_NULL(data_);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	45	if (cursor_ <= limit_ &&
				46	static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
				47	*cursor_++ = static_cast<char>(c);
				48	} else {
				49	AddCharSlow(c);
				50	}
				51	}
				52
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	53	void Reset() {
				54	if (data_ == NULL) {
				55	data_ = NewArray<char>(kInitialCapacity);
				56	limit_ = ComputeLimit(data_, kInitialCapacity);
				57	}
				58	cursor_ = data_;
				59	}
				60
				61	int pos() const {
				62	ASSERT_NOT_NULL(data_);
				63	return static_cast<int>(cursor_ - data_);
				64	}
				65
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	66	char* data() const { return data_; }
				67
				68	private:
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	69	static const int kInitialCapacity = 256;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	70	char* data_;
				71	char* cursor_;
				72	char* limit_;
				73
				74	int Capacity() const {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	75	ASSERT_NOT_NULL(data_);
				76	return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	77	}
				78
				79	static char* ComputeLimit(char* data, int capacity) {
				80	return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
				81	}
				82
				83	void AddCharSlow(uc32 c);
				84	};
				85
				86
				87	class UTF16Buffer {
				88	public:
				89	UTF16Buffer();
				90	virtual ~UTF16Buffer() {}
				91
				92	virtual void PushBack(uc32 ch) = 0;
				93	// returns a value < 0 when the buffer end is reached
				94	virtual uc32 Advance() = 0;
				95	virtual void SeekForward(int pos) = 0;
				96
				97	int pos() const { return pos_; }
				98	int size() const { return size_; }
				99	Handle<String> SubString(int start, int end);
				100
				101	protected:
				102	Handle<String> data_;
				103	int pos_;
				104	int size_;
				105	};
				106
				107
				108	class CharacterStreamUTF16Buffer: public UTF16Buffer {
				109	public:
				110	CharacterStreamUTF16Buffer();
				111	virtual ~CharacterStreamUTF16Buffer() {}
				112	void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
				113	virtual void PushBack(uc32 ch);
				114	virtual uc32 Advance();
				115	virtual void SeekForward(int pos);
				116
				117	private:
				118	List<uc32> pushback_buffer_;
				119	uc32 last_;
				120	unibrow::CharacterStream* stream_;
				121
				122	List<uc32>* pushback_buffer() { return &pushback_buffer_; }
				123	};
				124
				125
				126	class TwoByteStringUTF16Buffer: public UTF16Buffer {
				127	public:
				128	TwoByteStringUTF16Buffer();
				129	virtual ~TwoByteStringUTF16Buffer() {}
				130	void Initialize(Handle<ExternalTwoByteString> data);
				131	virtual void PushBack(uc32 ch);
				132	virtual uc32 Advance();
				133	virtual void SeekForward(int pos);
				134
				135	private:
				136	const uint16_t* raw_data_;
				137	};
				138
				139
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	140	class KeywordMatcher {
				141	// Incrementally recognize keywords.
				142	//
				143	// Recognized keywords:
				144	// break case catch const* continue debugger* default delete do else
				145	// finally false for function if in instanceof native* new null
				146	// return switch this throw true try typeof var void while with
				147	//
				148	// *: Actually "future reserved keywords". These are the only ones we
				149	// recognize; the remaining ones are allowed as identifiers.
				150	public:
				151	KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
				152
				153	Token::Value token() { return token_; }
				154
				155	inline void AddChar(uc32 input) {
				156	if (state_ != UNMATCHABLE) {
				157	Step(input);
				158	}
				159	}
				160
				161	void Fail() {
				162	token_ = Token::IDENTIFIER;
				163	state_ = UNMATCHABLE;
				164	}
				165
				166	private:
				167	enum State {
				168	UNMATCHABLE,
				169	INITIAL,
				170	KEYWORD_PREFIX,
				171	KEYWORD_MATCHED,
				172	C,
				173	CA,
				174	CO,
				175	CON,
				176	D,
				177	DE,
				178	F,
				179	I,
				180	IN,
				181	N,
				182	T,
				183	TH,
				184	TR,
				185	V,
				186	W
				187	};
				188
				189	struct FirstState {
				190	const char* keyword;
				191	State state;
				192	Token::Value token;
				193	};
				194
				195	// Range of possible first characters of a keyword.
				196	static const unsigned int kFirstCharRangeMin = 'b';
				197	static const unsigned int kFirstCharRangeMax = 'w';
				198	static const unsigned int kFirstCharRangeLength =
				199	kFirstCharRangeMax - kFirstCharRangeMin + 1;
				200	// State map for first keyword character range.
				201	static FirstState first_states_[kFirstCharRangeLength];
				202
				203	// Current state.
				204	State state_;
				205	// Token for currently added characters.
				206	Token::Value token_;
				207
				208	// Matching a specific keyword string (there is only one possible valid
				209	// keyword with the current prefix).
				210	const char* keyword_;
				211	int counter_;
				212	Token::Value keyword_token_;
				213
				214	// If input equals keyword's character at position, continue matching keyword
				215	// from that position.
				216	inline bool MatchKeywordStart(uc32 input,
				217	const char* keyword,
				218	int position,
				219	Token::Value token_if_match) {
				220	if (input == keyword[position]) {
				221	state_ = KEYWORD_PREFIX;
				222	this->keyword_ = keyword;
				223	this->counter_ = position + 1;
				224	this->keyword_token_ = token_if_match;
				225	return true;
				226	}
				227	return false;
				228	}
				229
				230	// If input equals match character, transition to new state and return true.
				231	inline bool MatchState(uc32 input, char match, State new_state) {
				232	if (input == match) {
				233	state_ = new_state;
				234	return true;
				235	}
				236	return false;
				237	}
				238
				239	inline bool MatchKeyword(uc32 input,
				240	char match,
				241	State new_state,
				242	Token::Value keyword_token) {
				243	if (input == match) { // Matched "do".
				244	state_ = new_state;
				245	token_ = keyword_token;
				246	return true;
				247	}
				248	return false;
				249	}
				250
				251	void Step(uc32 input);
				252	};
				253
				254
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	255	enum ParserMode { PARSE, PREPARSE };
				256	enum ParserLanguage { JAVASCRIPT, JSON };
				257
				258
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	259	class Scanner {
				260	public:
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	261	typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
				262
				263	// Construction
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	264	explicit Scanner(ParserMode parse_mode);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	265
				266	// Initialize the Scanner to scan source:
				267	void Init(Handle<String> source,
				268	unibrow::CharacterStream* stream,
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	269	int position,
				270	ParserLanguage language);
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	271
				272	// Returns the next token.
				273	Token::Value Next();
				274
				275	// One token look-ahead (past the token returned by Next()).
				276	Token::Value peek() const { return next_.token; }
				277
				278	// Returns true if there was a line terminator before the peek'ed token.
				279	bool has_line_terminator_before_next() const {
				280	return has_line_terminator_before_next_;
				281	}
				282
				283	struct Location {
				284	Location(int b, int e) : beg_pos(b), end_pos(e) { }
				285	Location() : beg_pos(0), end_pos(0) { }
				286	int beg_pos;
				287	int end_pos;
				288	};
				289
				290	// Returns the location information for the current token
				291	// (the token returned by Next()).
				292	Location location() const { return current_.location; }
				293	Location peek_location() const { return next_.location; }
				294
				295	// Returns the literal string, if any, for the current token (the
				296	// token returned by Next()). The string is 0-terminated and in
				297	// UTF-8 format; it may contain 0-characters. Literal strings are
				298	// collected for identifiers, strings, and numbers.
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	299	// These functions only give the correct result if the literal
				300	// was scanned between calls to StartLiteral() and TerminateLiteral().
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	301	const char* literal_string() const {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	302	return current_.literal_buffer->data();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	303	}
				304	int literal_length() const {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	305	// Excluding terminal '\0' added by TerminateLiteral().
				306	return current_.literal_buffer->pos() - 1;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	307	}
				308
				309	// Returns the literal string for the next token (the token that
				310	// would be returned if Next() were called).
				311	const char* next_literal_string() const {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	312	return next_.literal_buffer->data();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	313	}
				314	// Returns the length of the next token (that would be returned if
				315	// Next() were called).
				316	int next_literal_length() const {
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	317	return next_.literal_buffer->pos() - 1;
				318	}
				319
				320	Vector<const char> next_literal() const {
				321	return Vector<const char>(next_literal_string(),
				322	next_literal_length());
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	323	}
				324
				325	// Scans the input as a regular expression pattern, previous
				326	// character(s) must be /(=). Returns true if a pattern is scanned.
				327	bool ScanRegExpPattern(bool seen_equal);
				328	// Returns true if regexp flags are scanned (always since flags can
				329	// be empty).
				330	bool ScanRegExpFlags();
				331
				332	// Seek forward to the given position. This operation does not
				333	// work in general, for instance when there are pushed back
				334	// characters, but works for seeking forward until simple delimiter
				335	// tokens, which is what it is used for.
				336	void SeekForward(int pos);
				337
				338	Handle<String> SubString(int start_pos, int end_pos);
				339	bool stack_overflow() { return stack_overflow_; }
				340
				341	static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
				342
				343	// Tells whether the buffer contains an identifier (no escapes).
				344	// Used for checking if a property name is an identifier.
				345	static bool IsIdentifier(unibrow::CharacterStream* buffer);
				346
				347	static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
				348	static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
				349	static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
				350	static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
				351
				352	static const int kCharacterLookaheadBufferSize = 1;
				353
				354	private:
				355	CharacterStreamUTF16Buffer char_stream_buffer_;
				356	TwoByteStringUTF16Buffer two_byte_string_buffer_;
				357
				358	// Source.
				359	UTF16Buffer* source_;
				360	int position_;
				361
				362	// Buffer to hold literal values (identifiers, strings, numbers)
				363	// using 0-terminated UTF-8 encoding.
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	364	UTF8Buffer literal_buffer_1_;
				365	UTF8Buffer literal_buffer_2_;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	366
				367	bool stack_overflow_;
				368	static StaticResource<Utf8Decoder> utf8_decoder_;
				369
				370	// One Unicode character look-ahead; c0_ < 0 at the end of the input.
				371	uc32 c0_;
				372
				373	// The current and look-ahead token.
				374	struct TokenDesc {
				375	Token::Value token;
				376	Location location;
Steve Block	d0582a6	2009-12-15 09:54:21 +0000	[diff] [blame]	377	UTF8Buffer* literal_buffer;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	378	};
				379
				380	TokenDesc current_; // desc for current token (as returned by Next())
				381	TokenDesc next_; // desc for next token (one token look-ahead)
				382	bool has_line_terminator_before_next_;
				383	bool is_pre_parsing_;
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	384	bool is_parsing_json_;
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	385
				386	// Literal buffer support
				387	void StartLiteral();
				388	void AddChar(uc32 ch);
				389	void AddCharAdvance();
				390	void TerminateLiteral();
				391
				392	// Low-level scanning support.
				393	void Advance() { c0_ = source_->Advance(); }
				394	void PushBack(uc32 ch) {
				395	source_->PushBack(ch);
				396	c0_ = ch;
				397	}
				398
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	399	bool SkipWhiteSpace() {
				400	if (is_parsing_json_) {
				401	return SkipJsonWhiteSpace();
				402	} else {
				403	return SkipJavaScriptWhiteSpace();
				404	}
				405	}
				406	bool SkipJavaScriptWhiteSpace();
				407	bool SkipJsonWhiteSpace();
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	408	Token::Value SkipSingleLineComment();
				409	Token::Value SkipMultiLineComment();
				410
				411	inline Token::Value Select(Token::Value tok);
				412	inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
				413
Leon Clarke	4515c47	2010-02-03 11:58:03 +0000	[diff] [blame^]	414	inline void Scan() {
				415	if (is_parsing_json_) {
				416	ScanJson();
				417	} else {
				418	ScanJavaScript();
				419	}
				420	}
				421
				422	// Scans a single JavaScript token.
				423	void ScanJavaScript();
				424
				425	// Scan a single JSON token. The JSON lexical grammar is specified in the
				426	// ECMAScript 5 standard, section 15.12.1.1.
				427	// Recognizes all of the single-character tokens directly, or calls a function
				428	// to scan a number, string or identifier literal.
				429	// The only allowed whitespace characters between tokens are tab,
				430	// carriage-return, newline and space.
				431	void ScanJson();
				432
				433	// A JSON number (production JSONNumber) is a subset of the valid JavaScript
				434	// decimal number literals.
				435	// It includes an optional minus sign, must have at least one
				436	// digit before and after a decimal point, may not have prefixed zeros (unless
				437	// the integer part is zero), and may include an exponent part (e.g., "e-10").
				438	// Hexadecimal and octal numbers are not allowed.
				439	Token::Value ScanJsonNumber();
				440	// A JSON string (production JSONString) is a subset of valid JavaScript string
				441	// literals. The string must only be double-quoted (not single-quoted), and
				442	// the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
				443	// four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
				444	Token::Value ScanJsonString();
				445	// Used to recognize one of the literals "true", "false", or "null". These
				446	// are the only valid JSON identifiers (productions JSONBooleanLiteral,
				447	// JSONNullLiteral).
				448	Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
				449
Steve Block	a7e24c1	2009-10-30 11:49:00 +0000	[diff] [blame]	450	void ScanDecimalDigits();
				451	Token::Value ScanNumber(bool seen_period);
				452	Token::Value ScanIdentifier();
				453	uc32 ScanHexEscape(uc32 c, int length);
				454	uc32 ScanOctalEscape(uc32 c, int length);
				455	void ScanEscape();
				456	Token::Value ScanString();
				457
				458	// Scans a possible HTML comment -- begins with '<!'.
				459	Token::Value ScanHtmlComment();
				460
				461	// Return the current source position.
				462	int source_pos() {
				463	return source_->pos() - kCharacterLookaheadBufferSize + position_;
				464	}
				465
				466	// Decodes a unicode escape-sequence which is part of an identifier.
				467	// If the escape sequence cannot be decoded the result is kBadRune.
				468	uc32 ScanIdentifierUnicodeEscape();
				469	};
				470
				471	} } // namespace v8::internal
				472
				473	#endif // V8_SCANNER_H_