blob: bdf899b54df4fb53b699a2f5780e672b03f86bda [file] [log] [blame]
Kristian Monsen0d5e1162010-09-30 15:31:59 +01001// Copyright 2010 the V8 project authors. All rights reserved.
Steve Blocka7e24c12009-10-30 11:49:00 +00002// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6// * Redistributions of source code must retain the above copyright
7// notice, this list of conditions and the following disclaimer.
8// * Redistributions in binary form must reproduce the above
9// copyright notice, this list of conditions and the following
10// disclaimer in the documentation and/or other materials provided
11// with the distribution.
12// * Neither the name of Google Inc. nor the names of its
13// contributors may be used to endorse or promote products derived
14// from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
Teng-Hui Zhu3e5fa292010-11-09 16:16:48 -080033#include "scanner-base.h"
Steve Blocka7e24c12009-10-30 11:49:00 +000034
35namespace v8 {
36namespace internal {
37
Ben Murdochb0fe1622011-05-05 13:52:32 +010038// A buffered character stream based on a random access character
39// source (ReadBlock can be called with pos_ pointing to any position,
40// even positions before the current).
41class BufferedUC16CharacterStream: public UC16CharacterStream {
Steve Blocka7e24c12009-10-30 11:49:00 +000042 public:
Ben Murdochb0fe1622011-05-05 13:52:32 +010043 BufferedUC16CharacterStream();
44 virtual ~BufferedUC16CharacterStream();
Steve Blocka7e24c12009-10-30 11:49:00 +000045
Ben Murdochb0fe1622011-05-05 13:52:32 +010046 virtual void PushBack(uc16 character);
Steve Blocka7e24c12009-10-30 11:49:00 +000047
Ben Murdochb0fe1622011-05-05 13:52:32 +010048 protected:
49 static const unsigned kBufferSize = 512;
50 static const unsigned kPushBackStepSize = 16;
51
52 virtual unsigned SlowSeekForward(unsigned delta);
53 virtual bool ReadBlock();
54 virtual void SlowPushBack(uc16 character);
55
56 virtual unsigned BufferSeekForward(unsigned delta) = 0;
57 virtual unsigned FillBuffer(unsigned position, unsigned length) = 0;
58
59 const uc16* pushback_limit_;
60 uc16 buffer_[kBufferSize];
61};
62
63
64// Generic string stream.
65class GenericStringUC16CharacterStream: public BufferedUC16CharacterStream {
66 public:
67 GenericStringUC16CharacterStream(Handle<String> data,
68 unsigned start_position,
69 unsigned end_position);
70 virtual ~GenericStringUC16CharacterStream();
71
72 protected:
73 virtual unsigned BufferSeekForward(unsigned delta);
74 virtual unsigned FillBuffer(unsigned position, unsigned length);
75
76 Handle<String> string_;
77 unsigned start_position_;
78 unsigned length_;
79};
80
81
82// UC16 stream based on a literal UTF-8 string.
83class Utf8ToUC16CharacterStream: public BufferedUC16CharacterStream {
84 public:
85 Utf8ToUC16CharacterStream(const byte* data, unsigned length);
86 virtual ~Utf8ToUC16CharacterStream();
87
88 protected:
89 virtual unsigned BufferSeekForward(unsigned delta);
90 virtual unsigned FillBuffer(unsigned char_position, unsigned length);
91 void SetRawPosition(unsigned char_position);
92
93 const byte* raw_data_;
94 unsigned raw_data_length_; // Measured in bytes, not characters.
95 unsigned raw_data_pos_;
96 // The character position of the character at raw_data[raw_data_pos_].
97 // Not necessarily the same as pos_.
98 unsigned raw_character_position_;
Steve Blocka7e24c12009-10-30 11:49:00 +000099};
100
101
Steve Block6ded16b2010-05-10 14:33:55 +0100102// UTF16 buffer to read characters from an external string.
Ben Murdochb0fe1622011-05-05 13:52:32 +0100103class ExternalTwoByteStringUC16CharacterStream: public UC16CharacterStream {
Steve Blocka7e24c12009-10-30 11:49:00 +0000104 public:
Ben Murdochb0fe1622011-05-05 13:52:32 +0100105 ExternalTwoByteStringUC16CharacterStream(Handle<ExternalTwoByteString> data,
106 int start_position,
107 int end_position);
108 virtual ~ExternalTwoByteStringUC16CharacterStream();
Steve Blocka7e24c12009-10-30 11:49:00 +0000109
Ben Murdochb0fe1622011-05-05 13:52:32 +0100110 virtual void PushBack(uc16 character) {
111 ASSERT(buffer_cursor_ > raw_data_);
112 buffer_cursor_--;
113 pos_--;
114 }
115 protected:
116 virtual unsigned SlowSeekForward(unsigned delta) {
117 // Fast case always handles seeking.
118 return 0;
119 }
120 virtual bool ReadBlock() {
121 // Entire string is read at start.
122 return false;
123 }
124 Handle<ExternalTwoByteString> source_;
125 const uc16* raw_data_; // Pointer to the actual array of characters.
Steve Blocka7e24c12009-10-30 11:49:00 +0000126};
127
128
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800129// ----------------------------------------------------------------------------
130// V8JavaScriptScanner
131// JavaScript scanner getting its input from either a V8 String or a unicode
132// CharacterStream.
Kristian Monsen80d68ea2010-09-08 11:05:35 +0100133
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800134class V8JavaScriptScanner : public JavaScriptScanner {
135 public:
Ben Murdochb0fe1622011-05-05 13:52:32 +0100136 V8JavaScriptScanner();
Steve Block9fac8402011-05-12 15:51:54 +0100137 void Initialize(UC16CharacterStream* source);
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800138};
139
140
141class JsonScanner : public Scanner {
142 public:
143 JsonScanner();
144
Ben Murdochb0fe1622011-05-05 13:52:32 +0100145 void Initialize(UC16CharacterStream* source);
Steve Blocka7e24c12009-10-30 11:49:00 +0000146
147 // Returns the next token.
148 Token::Value Next();
149
Shimeng (Simon) Wang8a31eba2010-12-06 19:01:33 -0800150 protected:
151 // Skip past JSON whitespace (only space, tab, newline and carrige-return).
Leon Clarke4515c472010-02-03 11:58:03 +0000152 bool SkipJsonWhiteSpace();
Leon Clarke4515c472010-02-03 11:58:03 +0000153
154 // Scan a single JSON token. The JSON lexical grammar is specified in the
155 // ECMAScript 5 standard, section 15.12.1.1.
156 // Recognizes all of the single-character tokens directly, or calls a function
157 // to scan a number, string or identifier literal.
158 // The only allowed whitespace characters between tokens are tab,
Ben Murdochb0fe1622011-05-05 13:52:32 +0100159 // carriage-return, newline and space.
Leon Clarke4515c472010-02-03 11:58:03 +0000160 void ScanJson();
161
162 // A JSON number (production JSONNumber) is a subset of the valid JavaScript
163 // decimal number literals.
164 // It includes an optional minus sign, must have at least one
165 // digit before and after a decimal point, may not have prefixed zeros (unless
166 // the integer part is zero), and may include an exponent part (e.g., "e-10").
167 // Hexadecimal and octal numbers are not allowed.
168 Token::Value ScanJsonNumber();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100169
Leon Clarke4515c472010-02-03 11:58:03 +0000170 // A JSON string (production JSONString) is subset of valid JavaScript string
171 // literals. The string must only be double-quoted (not single-quoted), and
172 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
173 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
174 Token::Value ScanJsonString();
Kristian Monsen9dcf7e22010-06-28 14:14:28 +0100175
Leon Clarke4515c472010-02-03 11:58:03 +0000176 // Used to recognizes one of the literals "true", "false", or "null". These
177 // are the only valid JSON identifiers (productions JSONBooleanLiteral,
178 // JSONNullLiteral).
179 Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
Steve Blocka7e24c12009-10-30 11:49:00 +0000180};
181
Steve Blocka7e24c12009-10-30 11:49:00 +0000182} } // namespace v8::internal
183
184#endif // V8_SCANNER_H_