blob: 610091c52ec51fe52af9241dba844359a429585b [file] [log] [blame]
Ben Murdoch4a90d5f2016-03-22 12:00:34 +00001// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Features shared by parsing and pre-parsing scanners.
6
7#ifndef V8_PARSING_SCANNER_H_
8#define V8_PARSING_SCANNER_H_
9
10#include "src/allocation.h"
Ben Murdoch61f157c2016-09-16 13:49:30 +010011#include "src/base/hashmap.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000012#include "src/base/logging.h"
13#include "src/char-predicates.h"
Ben Murdochda12d292016-06-02 14:46:10 +010014#include "src/collector.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000015#include "src/globals.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000016#include "src/list.h"
Ben Murdochda12d292016-06-02 14:46:10 +010017#include "src/messages.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000018#include "src/parsing/token.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000019#include "src/unicode-decoder.h"
Ben Murdoch61f157c2016-09-16 13:49:30 +010020#include "src/unicode.h"
Ben Murdoch4a90d5f2016-03-22 12:00:34 +000021
22namespace v8 {
23namespace internal {
24
25
26class AstRawString;
27class AstValueFactory;
28class ParserRecorder;
29class UnicodeCache;
30
31
32// ---------------------------------------------------------------------
33// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
34// A code unit is a 16 bit value representing either a 16 bit code point
35// or one part of a surrogate pair that make a single 21 bit code point.
36
37class Utf16CharacterStream {
38 public:
39 Utf16CharacterStream() : pos_(0) { }
40 virtual ~Utf16CharacterStream() { }
41
42 // Returns and advances past the next UTF-16 code unit in the input
43 // stream. If there are no more code units, it returns a negative
44 // value.
45 inline uc32 Advance() {
46 if (buffer_cursor_ < buffer_end_ || ReadBlock()) {
47 pos_++;
48 return static_cast<uc32>(*(buffer_cursor_++));
49 }
50 // Note: currently the following increment is necessary to avoid a
51 // parser problem! The scanner treats the final kEndOfInput as
52 // a code unit with a position, and does math relative to that
53 // position.
54 pos_++;
55
56 return kEndOfInput;
57 }
58
59 // Return the current position in the code unit stream.
60 // Starts at zero.
61 inline size_t pos() const { return pos_; }
62
63 // Skips forward past the next code_unit_count UTF-16 code units
64 // in the input, or until the end of input if that comes sooner.
65 // Returns the number of code units actually skipped. If less
66 // than code_unit_count,
67 inline size_t SeekForward(size_t code_unit_count) {
68 size_t buffered_chars = buffer_end_ - buffer_cursor_;
69 if (code_unit_count <= buffered_chars) {
70 buffer_cursor_ += code_unit_count;
71 pos_ += code_unit_count;
72 return code_unit_count;
73 }
74 return SlowSeekForward(code_unit_count);
75 }
76
77 // Pushes back the most recently read UTF-16 code unit (or negative
78 // value if at end of input), i.e., the value returned by the most recent
79 // call to Advance.
80 // Must not be used right after calling SeekForward.
81 virtual void PushBack(int32_t code_unit) = 0;
82
83 virtual bool SetBookmark();
84 virtual void ResetToBookmark();
85
86 protected:
87 static const uc32 kEndOfInput = -1;
88
89 // Ensures that the buffer_cursor_ points to the code_unit at
90 // position pos_ of the input, if possible. If the position
91 // is at or after the end of the input, return false. If there
92 // are more code_units available, return true.
93 virtual bool ReadBlock() = 0;
94 virtual size_t SlowSeekForward(size_t code_unit_count) = 0;
95
96 const uint16_t* buffer_cursor_;
97 const uint16_t* buffer_end_;
98 size_t pos_;
99};
100
101
102// ---------------------------------------------------------------------
103// DuplicateFinder discovers duplicate symbols.
104
105class DuplicateFinder {
106 public:
107 explicit DuplicateFinder(UnicodeCache* constants)
108 : unicode_constants_(constants),
109 backing_store_(16),
110 map_(&Match) { }
111
112 int AddOneByteSymbol(Vector<const uint8_t> key, int value);
113 int AddTwoByteSymbol(Vector<const uint16_t> key, int value);
114 // Add a a number literal by converting it (if necessary)
115 // to the string that ToString(ToNumber(literal)) would generate.
116 // and then adding that string with AddOneByteSymbol.
117 // This string is the actual value used as key in an object literal,
118 // and the one that must be different from the other keys.
119 int AddNumber(Vector<const uint8_t> key, int value);
120
121 private:
122 int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value);
123 // Backs up the key and its length in the backing store.
124 // The backup is stored with a base 127 encoding of the
125 // length (plus a bit saying whether the string is one byte),
126 // followed by the bytes of the key.
127 uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte);
128
129 // Compare two encoded keys (both pointing into the backing store)
130 // for having the same base-127 encoded lengths and representation.
131 // and then having the same 'length' bytes following.
132 static bool Match(void* first, void* second);
133 // Creates a hash from a sequence of bytes.
134 static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte);
135 // Checks whether a string containing a JS number is its canonical
136 // form.
137 static bool IsNumberCanonical(Vector<const uint8_t> key);
138
139 // Size of buffer. Sufficient for using it to call DoubleToCString in
140 // from conversions.h.
141 static const int kBufferSize = 100;
142
143 UnicodeCache* unicode_constants_;
144 // Backing store used to store strings used as hashmap keys.
145 SequenceCollector<unsigned char> backing_store_;
Ben Murdoch61f157c2016-09-16 13:49:30 +0100146 base::HashMap map_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000147 // Buffer used for string->number->canonical string conversions.
148 char number_buffer_[kBufferSize];
149};
150
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000151// ----------------------------------------------------------------------------
152// LiteralBuffer - Collector of chars of literals.
153
Ben Murdoch61f157c2016-09-16 13:49:30 +0100154const int kMaxAscii = 127;
155
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000156class LiteralBuffer {
157 public:
158 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { }
159
160 ~LiteralBuffer() { backing_store_.Dispose(); }
161
Ben Murdoch61f157c2016-09-16 13:49:30 +0100162 INLINE(void AddChar(char code_unit)) {
163 if (position_ >= backing_store_.length()) ExpandBuffer();
164 DCHECK(is_one_byte_);
165 DCHECK(0 <= code_unit && code_unit <= kMaxAscii);
166 backing_store_[position_] = static_cast<byte>(code_unit);
167 position_ += kOneByteSize;
168 return;
169 }
170
171 INLINE(void AddChar(uc32 code_unit)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000172 if (position_ >= backing_store_.length()) ExpandBuffer();
173 if (is_one_byte_) {
174 if (code_unit <= unibrow::Latin1::kMaxChar) {
175 backing_store_[position_] = static_cast<byte>(code_unit);
176 position_ += kOneByteSize;
177 return;
178 }
179 ConvertToTwoByte();
180 }
181 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
182 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
183 position_ += kUC16Size;
184 } else {
185 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
186 unibrow::Utf16::LeadSurrogate(code_unit);
187 position_ += kUC16Size;
188 if (position_ >= backing_store_.length()) ExpandBuffer();
189 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
190 unibrow::Utf16::TrailSurrogate(code_unit);
191 position_ += kUC16Size;
192 }
193 }
194
195 bool is_one_byte() const { return is_one_byte_; }
196
197 bool is_contextual_keyword(Vector<const char> keyword) const {
198 return is_one_byte() && keyword.length() == position_ &&
199 (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
200 }
201
202 Vector<const uint16_t> two_byte_literal() const {
203 DCHECK(!is_one_byte_);
204 DCHECK((position_ & 0x1) == 0);
205 return Vector<const uint16_t>(
206 reinterpret_cast<const uint16_t*>(backing_store_.start()),
207 position_ >> 1);
208 }
209
210 Vector<const uint8_t> one_byte_literal() const {
211 DCHECK(is_one_byte_);
212 return Vector<const uint8_t>(
213 reinterpret_cast<const uint8_t*>(backing_store_.start()),
214 position_);
215 }
216
217 int length() const {
218 return is_one_byte_ ? position_ : (position_ >> 1);
219 }
220
221 void ReduceLength(int delta) {
222 position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
223 }
224
225 void Reset() {
226 position_ = 0;
227 is_one_byte_ = true;
228 }
229
230 Handle<String> Internalize(Isolate* isolate) const;
231
232 void CopyFrom(const LiteralBuffer* other) {
233 if (other == nullptr) {
234 Reset();
235 } else {
236 is_one_byte_ = other->is_one_byte_;
237 position_ = other->position_;
Ben Murdochc5610432016-08-08 18:44:38 +0100238 if (position_ < backing_store_.length()) {
239 std::copy(other->backing_store_.begin(),
240 other->backing_store_.begin() + position_,
241 backing_store_.begin());
242 } else {
243 backing_store_.Dispose();
244 backing_store_ = other->backing_store_.Clone();
245 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000246 }
247 }
248
249 private:
250 static const int kInitialCapacity = 16;
251 static const int kGrowthFactory = 4;
252 static const int kMinConversionSlack = 256;
253 static const int kMaxGrowth = 1 * MB;
254 inline int NewCapacity(int min_capacity) {
255 int capacity = Max(min_capacity, backing_store_.length());
256 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
257 return new_capacity;
258 }
259
260 void ExpandBuffer() {
261 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
262 MemCopy(new_store.start(), backing_store_.start(), position_);
263 backing_store_.Dispose();
264 backing_store_ = new_store;
265 }
266
267 void ConvertToTwoByte() {
268 DCHECK(is_one_byte_);
269 Vector<byte> new_store;
270 int new_content_size = position_ * kUC16Size;
271 if (new_content_size >= backing_store_.length()) {
272 // Ensure room for all currently read code units as UC16 as well
273 // as the code unit about to be stored.
274 new_store = Vector<byte>::New(NewCapacity(new_content_size));
275 } else {
276 new_store = backing_store_;
277 }
278 uint8_t* src = backing_store_.start();
279 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
280 for (int i = position_ - 1; i >= 0; i--) {
281 dst[i] = src[i];
282 }
283 if (new_store.start() != backing_store_.start()) {
284 backing_store_.Dispose();
285 backing_store_ = new_store;
286 }
287 position_ = new_content_size;
288 is_one_byte_ = false;
289 }
290
291 bool is_one_byte_;
292 int position_;
293 Vector<byte> backing_store_;
294
295 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
296};
297
298
299// ----------------------------------------------------------------------------
300// JavaScript Scanner.
301
302class Scanner {
303 public:
304 // Scoped helper for literal recording. Automatically drops the literal
305 // if aborting the scanning before it's complete.
306 class LiteralScope {
307 public:
308 explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
309 scanner_->StartLiteral();
310 }
311 ~LiteralScope() {
312 if (!complete_) scanner_->DropLiteral();
313 }
314 void Complete() {
315 complete_ = true;
316 }
317
318 private:
319 Scanner* scanner_;
320 bool complete_;
321 };
322
323 // Scoped helper for a re-settable bookmark.
324 class BookmarkScope {
325 public:
326 explicit BookmarkScope(Scanner* scanner) : scanner_(scanner) {
327 DCHECK_NOT_NULL(scanner_);
328 }
329 ~BookmarkScope() { scanner_->DropBookmark(); }
330
331 bool Set() { return scanner_->SetBookmark(); }
332 void Reset() { scanner_->ResetToBookmark(); }
333 bool HasBeenSet() { return scanner_->BookmarkHasBeenSet(); }
334 bool HasBeenReset() { return scanner_->BookmarkHasBeenReset(); }
335
336 private:
337 Scanner* scanner_;
338
339 DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
340 };
341
342 // Representation of an interval of source positions.
343 struct Location {
344 Location(int b, int e) : beg_pos(b), end_pos(e) { }
345 Location() : beg_pos(0), end_pos(0) { }
346
347 bool IsValid() const {
348 return beg_pos >= 0 && end_pos >= beg_pos;
349 }
350
351 static Location invalid() { return Location(-1, -1); }
352
353 int beg_pos;
354 int end_pos;
355 };
356
357 // -1 is outside of the range of any real source code.
358 static const int kNoOctalLocation = -1;
359
360 explicit Scanner(UnicodeCache* scanner_contants);
361
362 void Initialize(Utf16CharacterStream* source);
363
364 // Returns the next token and advances input.
365 Token::Value Next();
366 // Returns the token following peek()
367 Token::Value PeekAhead();
368 // Returns the current token again.
369 Token::Value current_token() { return current_.token; }
370 // Returns the location information for the current token
371 // (the token last returned by Next()).
372 Location location() const { return current_.location; }
373
Ben Murdochda12d292016-06-02 14:46:10 +0100374 bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
375 MessageTemplate::Template error() const { return scanner_error_; }
376 Location error_location() const { return scanner_error_location_; }
377
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000378 // Similar functions for the upcoming token.
379
380 // One token look-ahead (past the token returned by Next()).
381 Token::Value peek() const { return next_.token; }
382
383 Location peek_location() const { return next_.location; }
384
385 bool literal_contains_escapes() const {
386 return LiteralContainsEscapes(current_);
387 }
388 bool next_literal_contains_escapes() const {
389 return LiteralContainsEscapes(next_);
390 }
391 bool is_literal_contextual_keyword(Vector<const char> keyword) {
392 DCHECK_NOT_NULL(current_.literal_chars);
393 return current_.literal_chars->is_contextual_keyword(keyword);
394 }
395 bool is_next_contextual_keyword(Vector<const char> keyword) {
396 DCHECK_NOT_NULL(next_.literal_chars);
397 return next_.literal_chars->is_contextual_keyword(keyword);
398 }
399
400 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
401 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
402 const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
403
404 double DoubleValue();
405 bool ContainsDot();
406 bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
407 if (is_literal_one_byte() &&
408 literal_length() == length &&
409 (allow_escapes || !literal_contains_escapes())) {
410 const char* token =
411 reinterpret_cast<const char*>(literal_one_byte_string().start());
412 return !strncmp(token, data, length);
413 }
414 return false;
415 }
416 inline bool UnescapedLiteralMatches(const char* data, int length) {
417 return LiteralMatches(data, length, false);
418 }
419
420 void IsGetOrSet(bool* is_get, bool* is_set) {
421 if (is_literal_one_byte() &&
422 literal_length() == 3 &&
423 !literal_contains_escapes()) {
424 const char* token =
425 reinterpret_cast<const char*>(literal_one_byte_string().start());
426 *is_get = strncmp(token, "get", 3) == 0;
427 *is_set = !*is_get && strncmp(token, "set", 3) == 0;
428 }
429 }
430
431 int FindSymbol(DuplicateFinder* finder, int value);
432
433 UnicodeCache* unicode_cache() { return unicode_cache_; }
434
435 // Returns the location of the last seen octal literal.
436 Location octal_position() const { return octal_pos_; }
437 void clear_octal_position() { octal_pos_ = Location::invalid(); }
Ben Murdochc5610432016-08-08 18:44:38 +0100438 // Returns the location of the last seen decimal literal with a leading zero.
439 Location decimal_with_leading_zero_position() const {
440 return decimal_with_leading_zero_pos_;
441 }
442 void clear_decimal_with_leading_zero_position() {
443 decimal_with_leading_zero_pos_ = Location::invalid();
444 }
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000445
446 // Returns the value of the last smi that was scanned.
447 int smi_value() const { return current_.smi_value_; }
448
449 // Seek forward to the given position. This operation does not
450 // work in general, for instance when there are pushed back
451 // characters, but works for seeking forward until simple delimiter
452 // tokens, which is what it is used for.
453 void SeekForward(int pos);
454
455 // Returns true if there was a line terminator before the peek'ed token,
456 // possibly inside a multi-line comment.
457 bool HasAnyLineTerminatorBeforeNext() const {
458 return has_line_terminator_before_next_ ||
459 has_multiline_comment_before_next_;
460 }
461
Ben Murdochc5610432016-08-08 18:44:38 +0100462 bool HasAnyLineTerminatorAfterNext() {
463 Token::Value ensure_next_next = PeekAhead();
464 USE(ensure_next_next);
465 return has_line_terminator_after_next_;
466 }
467
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000468 // Scans the input as a regular expression pattern, previous
469 // character(s) must be /(=). Returns true if a pattern is scanned.
470 bool ScanRegExpPattern(bool seen_equal);
471 // Scans the input as regular expression flags. Returns the flags on success.
472 Maybe<RegExp::Flags> ScanRegExpFlags();
473
474 // Scans the input as a template literal
475 Token::Value ScanTemplateStart();
476 Token::Value ScanTemplateContinuation();
477
478 const LiteralBuffer* source_url() const { return &source_url_; }
479 const LiteralBuffer* source_mapping_url() const {
480 return &source_mapping_url_;
481 }
482
483 bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;
484
Ben Murdoch097c5b22016-05-18 11:27:45 +0100485 bool FoundHtmlComment() const { return found_html_comment_; }
486
Ben Murdochda12d292016-06-02 14:46:10 +0100487#define DECLARE_ACCESSORS(name) \
488 inline bool allow_##name() const { return allow_##name##_; } \
489 inline void set_allow_##name(bool allow) { allow_##name##_ = allow; }
490 DECLARE_ACCESSORS(harmony_exponentiation_operator)
491#undef ACCESSOR
492
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000493 private:
494 // The current and look-ahead token.
495 struct TokenDesc {
496 Token::Value token;
497 Location location;
498 LiteralBuffer* literal_chars;
499 LiteralBuffer* raw_literal_chars;
500 int smi_value_;
501 };
502
503 static const int kCharacterLookaheadBufferSize = 1;
504
505 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
506 template <bool capture_raw>
507 uc32 ScanOctalEscape(uc32 c, int length);
508
509 // Call this after setting source_ to the input.
510 void Init() {
511 // Set c0_ (one character ahead)
512 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
513 Advance();
514 // Initialize current_ to not refer to a literal.
515 current_.literal_chars = NULL;
516 current_.raw_literal_chars = NULL;
517 next_next_.token = Token::UNINITIALIZED;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100518 found_html_comment_ = false;
Ben Murdochda12d292016-06-02 14:46:10 +0100519 scanner_error_ = MessageTemplate::kNone;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000520 }
521
522 // Support BookmarkScope functionality.
523 bool SetBookmark();
524 void ResetToBookmark();
525 bool BookmarkHasBeenSet();
526 bool BookmarkHasBeenReset();
527 void DropBookmark();
528 static void CopyTokenDesc(TokenDesc* to, TokenDesc* from);
529
Ben Murdochda12d292016-06-02 14:46:10 +0100530 void ReportScannerError(const Location& location,
531 MessageTemplate::Template error) {
532 if (has_error()) return;
533 scanner_error_ = error;
534 scanner_error_location_ = location;
535 }
536
537 void ReportScannerError(int pos, MessageTemplate::Template error) {
538 if (has_error()) return;
539 scanner_error_ = error;
540 scanner_error_location_ = Location(pos, pos + 1);
541 }
542
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000543 // Literal buffer support
544 inline void StartLiteral() {
545 LiteralBuffer* free_buffer =
546 (current_.literal_chars == &literal_buffer0_)
547 ? &literal_buffer1_
548 : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
549 : &literal_buffer0_;
550 free_buffer->Reset();
551 next_.literal_chars = free_buffer;
552 }
553
554 inline void StartRawLiteral() {
555 LiteralBuffer* free_buffer =
556 (current_.raw_literal_chars == &raw_literal_buffer0_)
557 ? &raw_literal_buffer1_
558 : (current_.raw_literal_chars == &raw_literal_buffer1_)
559 ? &raw_literal_buffer2_
560 : &raw_literal_buffer0_;
561 free_buffer->Reset();
562 next_.raw_literal_chars = free_buffer;
563 }
564
565 INLINE(void AddLiteralChar(uc32 c)) {
566 DCHECK_NOT_NULL(next_.literal_chars);
567 next_.literal_chars->AddChar(c);
568 }
569
Ben Murdoch61f157c2016-09-16 13:49:30 +0100570 INLINE(void AddLiteralChar(char c)) {
571 DCHECK_NOT_NULL(next_.literal_chars);
572 next_.literal_chars->AddChar(c);
573 }
574
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000575 INLINE(void AddRawLiteralChar(uc32 c)) {
576 DCHECK_NOT_NULL(next_.raw_literal_chars);
577 next_.raw_literal_chars->AddChar(c);
578 }
579
580 INLINE(void ReduceRawLiteralLength(int delta)) {
581 DCHECK_NOT_NULL(next_.raw_literal_chars);
582 next_.raw_literal_chars->ReduceLength(delta);
583 }
584
585 // Stops scanning of a literal and drop the collected characters,
586 // e.g., due to an encountered error.
587 inline void DropLiteral() {
588 next_.literal_chars = NULL;
589 next_.raw_literal_chars = NULL;
590 }
591
592 inline void AddLiteralCharAdvance() {
593 AddLiteralChar(c0_);
594 Advance();
595 }
596
597 // Low-level scanning support.
598 template <bool capture_raw = false, bool check_surrogate = true>
599 void Advance() {
600 if (capture_raw) {
601 AddRawLiteralChar(c0_);
602 }
603 c0_ = source_->Advance();
604 if (check_surrogate) HandleLeadSurrogate();
605 }
606
607 void HandleLeadSurrogate() {
608 if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
609 uc32 c1 = source_->Advance();
610 if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
611 source_->PushBack(c1);
612 } else {
613 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
614 }
615 }
616 }
617
618 void PushBack(uc32 ch) {
Ben Murdochc5610432016-08-08 18:44:38 +0100619 if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000620 source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
621 source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
622 } else {
623 source_->PushBack(c0_);
624 }
625 c0_ = ch;
626 }
627
628 inline Token::Value Select(Token::Value tok) {
629 Advance();
630 return tok;
631 }
632
633 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
634 Advance();
635 if (c0_ == next) {
636 Advance();
637 return then;
638 } else {
639 return else_;
640 }
641 }
642
643 // Returns the literal string, if any, for the current token (the
644 // token last returned by Next()). The string is 0-terminated.
645 // Literal strings are collected for identifiers, strings, numbers as well
646 // as for template literals. For template literals we also collect the raw
647 // form.
648 // These functions only give the correct result if the literal was scanned
649 // when a LiteralScope object is alive.
650 Vector<const uint8_t> literal_one_byte_string() {
651 DCHECK_NOT_NULL(current_.literal_chars);
652 return current_.literal_chars->one_byte_literal();
653 }
654 Vector<const uint16_t> literal_two_byte_string() {
655 DCHECK_NOT_NULL(current_.literal_chars);
656 return current_.literal_chars->two_byte_literal();
657 }
658 bool is_literal_one_byte() {
659 DCHECK_NOT_NULL(current_.literal_chars);
660 return current_.literal_chars->is_one_byte();
661 }
662 int literal_length() const {
663 DCHECK_NOT_NULL(current_.literal_chars);
664 return current_.literal_chars->length();
665 }
666 // Returns the literal string for the next token (the token that
667 // would be returned if Next() were called).
668 Vector<const uint8_t> next_literal_one_byte_string() {
669 DCHECK_NOT_NULL(next_.literal_chars);
670 return next_.literal_chars->one_byte_literal();
671 }
672 Vector<const uint16_t> next_literal_two_byte_string() {
673 DCHECK_NOT_NULL(next_.literal_chars);
674 return next_.literal_chars->two_byte_literal();
675 }
676 bool is_next_literal_one_byte() {
677 DCHECK_NOT_NULL(next_.literal_chars);
678 return next_.literal_chars->is_one_byte();
679 }
680 Vector<const uint8_t> raw_literal_one_byte_string() {
681 DCHECK_NOT_NULL(current_.raw_literal_chars);
682 return current_.raw_literal_chars->one_byte_literal();
683 }
684 Vector<const uint16_t> raw_literal_two_byte_string() {
685 DCHECK_NOT_NULL(current_.raw_literal_chars);
686 return current_.raw_literal_chars->two_byte_literal();
687 }
688 bool is_raw_literal_one_byte() {
689 DCHECK_NOT_NULL(current_.raw_literal_chars);
690 return current_.raw_literal_chars->is_one_byte();
691 }
692
Ben Murdochda12d292016-06-02 14:46:10 +0100693 template <bool capture_raw, bool unicode = false>
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000694 uc32 ScanHexNumber(int expected_length);
695 // Scan a number of any length but not bigger than max_value. For example, the
696 // number can be 000000001, so it's very long in characters but its value is
697 // small.
698 template <bool capture_raw>
Ben Murdochda12d292016-06-02 14:46:10 +0100699 uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000700
701 // Scans a single JavaScript token.
702 void Scan();
703
704 bool SkipWhiteSpace();
705 Token::Value SkipSingleLineComment();
706 Token::Value SkipSourceURLComment();
707 void TryToParseSourceURLComment();
708 Token::Value SkipMultiLineComment();
709 // Scans a possible HTML comment -- begins with '<!'.
710 Token::Value ScanHtmlComment();
711
712 void ScanDecimalDigits();
713 Token::Value ScanNumber(bool seen_period);
714 Token::Value ScanIdentifierOrKeyword();
715 Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
716
717 Token::Value ScanString();
718
719 // Scans an escape-sequence which is part of a string and adds the
720 // decoded character to the current literal. Returns true if a pattern
721 // is scanned.
722 template <bool capture_raw, bool in_template_literal>
723 bool ScanEscape();
724
725 // Decodes a Unicode escape-sequence which is part of an identifier.
726 // If the escape sequence cannot be decoded the result is kBadChar.
727 uc32 ScanIdentifierUnicodeEscape();
728 // Helper for the above functions.
729 template <bool capture_raw>
730 uc32 ScanUnicodeEscape();
731
732 Token::Value ScanTemplateSpan();
733
734 // Return the current source position.
735 int source_pos() {
736 return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
737 }
738
739 static bool LiteralContainsEscapes(const TokenDesc& token) {
740 Location location = token.location;
741 int source_length = (location.end_pos - location.beg_pos);
742 if (token.token == Token::STRING) {
743 // Subtract delimiters.
744 source_length -= 2;
745 }
746 return token.literal_chars->length() != source_length;
747 }
748
749 UnicodeCache* unicode_cache_;
750
751 // Buffers collecting literal strings, numbers, etc.
752 LiteralBuffer literal_buffer0_;
753 LiteralBuffer literal_buffer1_;
754 LiteralBuffer literal_buffer2_;
755
756 // Values parsed from magic comments.
757 LiteralBuffer source_url_;
758 LiteralBuffer source_mapping_url_;
759
760 // Buffer to store raw string values
761 LiteralBuffer raw_literal_buffer0_;
762 LiteralBuffer raw_literal_buffer1_;
763 LiteralBuffer raw_literal_buffer2_;
764
765 TokenDesc current_; // desc for current token (as returned by Next())
766 TokenDesc next_; // desc for next token (one token look-ahead)
767 TokenDesc next_next_; // desc for the token after next (after PeakAhead())
768
769 // Variables for Scanner::BookmarkScope and the *Bookmark implementation.
770 // These variables contain the scanner state when a bookmark is set.
771 //
772 // We will use bookmark_c0_ as a 'control' variable, where:
773 // - bookmark_c0_ >= 0: A bookmark has been set and this contains c0_.
774 // - bookmark_c0_ == -1: No bookmark has been set.
775 // - bookmark_c0_ == -2: The bookmark has been applied (ResetToBookmark).
776 //
777 // Which state is being bookmarked? The parser state is distributed over
778 // several variables, roughly like this:
779 // ... 1234 + 5678 ..... [character stream]
780 // [current_] [next_] c0_ | [scanner state]
781 // So when the scanner is logically at the beginning of an expression
782 // like "1234 + 4567", then:
783 // - current_ contains "1234"
784 // - next_ contains "+"
785 // - c0_ contains ' ' (the space between "+" and "5678",
786 // - the source_ character stream points to the beginning of "5678".
787 // To be able to restore this state, we will keep copies of current_, next_,
788 // and c0_; we'll ask the stream to bookmark itself, and we'll copy the
789 // contents of current_'s and next_'s literal buffers to bookmark_*_literal_.
790 static const uc32 kNoBookmark = -1;
791 static const uc32 kBookmarkWasApplied = -2;
792 uc32 bookmark_c0_;
793 TokenDesc bookmark_current_;
794 TokenDesc bookmark_next_;
795 LiteralBuffer bookmark_current_literal_;
796 LiteralBuffer bookmark_current_raw_literal_;
797 LiteralBuffer bookmark_next_literal_;
798 LiteralBuffer bookmark_next_raw_literal_;
799
800 // Input stream. Must be initialized to an Utf16CharacterStream.
801 Utf16CharacterStream* source_;
802
Ben Murdochc5610432016-08-08 18:44:38 +0100803 // Last-seen positions of potentially problematic tokens.
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000804 Location octal_pos_;
Ben Murdochc5610432016-08-08 18:44:38 +0100805 Location decimal_with_leading_zero_pos_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000806
807 // One Unicode character look-ahead; c0_ < 0 at the end of the input.
808 uc32 c0_;
809
810 // Whether there is a line terminator whitespace character after
811 // the current token, and before the next. Does not count newlines
812 // inside multiline comments.
813 bool has_line_terminator_before_next_;
814 // Whether there is a multi-line comment that contains a
815 // line-terminator after the current token, and before the next.
816 bool has_multiline_comment_before_next_;
Ben Murdochc5610432016-08-08 18:44:38 +0100817 bool has_line_terminator_after_next_;
Ben Murdoch097c5b22016-05-18 11:27:45 +0100818
819 // Whether this scanner encountered an HTML comment.
820 bool found_html_comment_;
Ben Murdochda12d292016-06-02 14:46:10 +0100821
822 bool allow_harmony_exponentiation_operator_;
823
824 MessageTemplate::Template scanner_error_;
825 Location scanner_error_location_;
Ben Murdoch4a90d5f2016-03-22 12:00:34 +0000826};
827
828} // namespace internal
829} // namespace v8
830
831#endif // V8_PARSING_SCANNER_H_