// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.
//
// Here we have a hand-written lexer. At first you might ask yourself,
// "Hand-written text processing? Is Kenton crazy?!" Well, first of all,
// yes I am crazy, but that's beside the point. There are actually reasons
// why I ended up writing this this way.
//
// The traditional approach to lexing is to use lex to generate a lexer for
// you. Unfortunately, lex's output is ridiculously ugly and difficult to
// integrate cleanly with C++ code, especially abstract code or code meant
// as a library. Better parser-generators exist but would add dependencies
// which most users won't already have, which we'd like to avoid. (GNU flex
// has a C++ output option, but it's still ridiculously ugly, non-abstract,
// and not library-friendly.)
//
// The next approach that any good software engineer should look at is to
// use regular expressions. And, indeed, I did. I have code which
// implements this same class using regular expressions. It's about 200
// lines shorter. However:
// - Rather than error messages telling you "This string has an invalid
// escape sequence at line 5, column 45", you get error messages like
// "Parse error on line 5". Giving more precise errors requires adding
// a lot of code that ends up basically as complex as the hand-coded
// version anyway.
// - The regular expression to match a string literal looks like this:
// kString = new RE("(\"([^\"\\\\]|" // non-escaped
// "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape
// "\\\\x[0-9a-fA-F])*\"|" // hex escape
// "\'([^\'\\\\]|" // Also support single-quotes.
// "\\\\[abfnrtv?\"'\\\\0-7]|"
// "\\\\x[0-9a-fA-F])*\')");
// Verifying the correctness of this line noise is actually harder than
// verifying the correctness of ConsumeString(), defined below. I'm not
// even confident that the above is correct, after staring at it for some
// time.
// - PCRE is fast, but there's still more overhead involved than the code
// below.
// - Sadly, regular expressions are not part of the C standard library, so
// using them would require depending on some other library. For the
// open source release, this could be really annoying. Nobody likes
// downloading one piece of software just to find that they need to
// download something else to make it work, and in all likelihood
// people downloading Protocol Buffers will already be doing so just
// to make something else work. We could include a copy of PCRE with
// our code, but that obligates us to keep it up-to-date and just seems
// like a big waste just to save 200 lines of code.
//
// On a similar but unrelated note, I'm even scared to use ctype.h.
// Apparently functions like isalpha() are locale-dependent. So, if we used
// that, then if this code is being called from some program that doesn't
// have its locale set to "C", it would behave strangely. We can't just set
// the locale to "C" ourselves since we might break the calling program that
// way, particularly if it is multi-threaded. WTF? Someone please let me
// (Kenton) know if I'm missing something here...
//
// I'd love to hear about other alternatives, though, as this code isn't
// exactly pretty.

91#include <google/protobuf/io/tokenizer.h>
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +000092#include <google/protobuf/stubs/common.h>
Feng Xiaoeee38b02015-08-22 18:25:48 -070093#include <google/protobuf/stubs/logging.h>
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +000094#include <google/protobuf/stubs/stringprintf.h>
jieluo@google.com4de8f552014-07-18 00:47:59 +000095#include <google/protobuf/io/strtod.h>
temporal40ee5512008-07-10 02:12:20 +000096#include <google/protobuf/io/zero_copy_stream.h>
97#include <google/protobuf/stubs/strutil.h>
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +000098#include <google/protobuf/stubs/stl_util.h>
temporal40ee5512008-07-10 02:12:20 +000099
100namespace google {
101namespace protobuf {
102namespace io {
103namespace {
104
// As mentioned above, I don't trust ctype.h due to the presence of "locales".
// So, I have written replacement functions here. Someone please smack me if
// this is a bad idea or if there is some way around this.
//
// These "character classes" are designed to be used in template methods.
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

// Declares a class NAME whose static InClass(char c) returns EXPRESSION,
// a boolean expression written in terms of `c`.
#define CHARACTER_CLASS(NAME, EXPRESSION)      \
  class NAME {                                 \
   public:                                     \
    static inline bool InClass(char c) {       \
      return EXPRESSION;                       \
    }                                          \
  }

// All ASCII whitespace, including the newline.
CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
                            c == '\r' || c == '\v' || c == '\f');
// Same as Whitespace, but a newline is *not* a member.
CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
                                     c == '\r' || c == '\v' || c == '\f');

// Characters strictly between '\0' and ' '.  '\0' is excluded on purpose
// (see the note above).
CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');

CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
                          ('a' <= c && c <= 'f') ||
                          ('A' <= c && c <= 'F'));

// Characters that may start an identifier.
CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
                        ('A' <= c && c <= 'Z') ||
                        (c == '_'));

// Characters that may continue an identifier.
CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
                              ('A' <= c && c <= 'Z') ||
                              ('0' <= c && c <= '9') ||
                              (c == '_'));

// Single characters that form a valid escape when they follow a backslash.
CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                        c == 'r' || c == 't' || c == 'v' || c == '\\' ||
                        c == '?' || c == '\'' || c == '\"');

#undef CHARACTER_CLASS
151
// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36: '0'-'9' map to 0-9 and letters of
// either case map to 10-35.  Returns -1 if the char is not a digit at all.
inline int DigitValue(char digit) {
  if ('0' <= digit && digit <= '9') return digit - '0';
  if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
  if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
  return -1;
}
160
// Maps the character following a backslash in a simple escape sequence to
// the character it denotes (e.g. 'n' -> '\n').
// Inline because it's only used in one place.
inline char TranslateEscape(char c) {
  switch (c) {
    case 'a':  return '\a';
    case 'b':  return '\b';
    case 'f':  return '\f';
    case 'n':  return '\n';
    case 'r':  return '\r';
    case 't':  return '\t';
    case 'v':  return '\v';
    case '\\': return '\\';
    case '?':  return '\?';  // Trigraphs = :(
    case '\'': return '\'';
    case '"':  return '\"';

    // We expect escape sequences to have been validated separately.
    default:   return '?';
  }
}
180
181} // anonymous namespace
182
183ErrorCollector::~ErrorCollector() {}
184
185// ===================================================================
186
187Tokenizer::Tokenizer(ZeroCopyInputStream* input,
188 ErrorCollector* error_collector)
189 : input_(input),
190 error_collector_(error_collector),
191 buffer_(NULL),
192 buffer_size_(0),
193 buffer_pos_(0),
194 read_error_(false),
195 line_(0),
196 column_(0),
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000197 record_target_(NULL),
198 record_start_(-1),
temporal40ee5512008-07-10 02:12:20 +0000199 allow_f_after_float_(false),
jieluo@google.com4de8f552014-07-18 00:47:59 +0000200 comment_style_(CPP_COMMENT_STYLE),
201 require_space_after_number_(true),
202 allow_multiline_strings_(false) {
temporal40ee5512008-07-10 02:12:20 +0000203
204 current_.line = 0;
205 current_.column = 0;
liujisi@google.com33165fe2010-11-02 13:14:58 +0000206 current_.end_column = 0;
temporal40ee5512008-07-10 02:12:20 +0000207 current_.type = TYPE_START;
208
209 Refresh();
210}
211
212Tokenizer::~Tokenizer() {
213 // If we had any buffer left unread, return it to the underlying stream
214 // so that someone else can read it.
215 if (buffer_size_ > buffer_pos_) {
216 input_->BackUp(buffer_size_ - buffer_pos_);
217 }
218}
219
220// -------------------------------------------------------------------
221// Internal helpers.
222
223void Tokenizer::NextChar() {
224 // Update our line and column counters based on the character being
225 // consumed.
226 if (current_char_ == '\n') {
227 ++line_;
228 column_ = 0;
229 } else if (current_char_ == '\t') {
230 column_ += kTabWidth - column_ % kTabWidth;
231 } else {
232 ++column_;
233 }
234
235 // Advance to the next character.
236 ++buffer_pos_;
237 if (buffer_pos_ < buffer_size_) {
238 current_char_ = buffer_[buffer_pos_];
239 } else {
240 Refresh();
241 }
242}
243
244void Tokenizer::Refresh() {
245 if (read_error_) {
246 current_char_ = '\0';
247 return;
248 }
249
250 // If we're in a token, append the rest of the buffer to it.
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000251 if (record_target_ != NULL && record_start_ < buffer_size_) {
252 record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
253 record_start_ = 0;
temporal40ee5512008-07-10 02:12:20 +0000254 }
255
256 const void* data = NULL;
257 buffer_ = NULL;
258 buffer_pos_ = 0;
259 do {
260 if (!input_->Next(&data, &buffer_size_)) {
261 // end of stream (or read error)
262 buffer_size_ = 0;
263 read_error_ = true;
264 current_char_ = '\0';
265 return;
266 }
267 } while (buffer_size_ == 0);
268
269 buffer_ = static_cast<const char*>(data);
270
271 current_char_ = buffer_[0];
272}
273
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000274inline void Tokenizer::RecordTo(string* target) {
275 record_target_ = target;
276 record_start_ = buffer_pos_;
temporal40ee5512008-07-10 02:12:20 +0000277}
278
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000279inline void Tokenizer::StopRecording() {
temporal40ee5512008-07-10 02:12:20 +0000280 // Note: The if() is necessary because some STL implementations crash when
281 // you call string::append(NULL, 0), presumably because they are trying to
282 // be helpful by detecting the NULL pointer, even though there's nothing
283 // wrong with reading zero bytes from NULL.
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000284 if (buffer_pos_ != record_start_) {
285 record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
temporal40ee5512008-07-10 02:12:20 +0000286 }
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000287 record_target_ = NULL;
288 record_start_ = -1;
289}
290
291inline void Tokenizer::StartToken() {
292 current_.type = TYPE_START; // Just for the sake of initializing it.
293 current_.text.clear();
294 current_.line = line_;
295 current_.column = column_;
296 RecordTo(&current_.text);
297}
298
299inline void Tokenizer::EndToken() {
300 StopRecording();
liujisi@google.com33165fe2010-11-02 13:14:58 +0000301 current_.end_column = column_;
temporal40ee5512008-07-10 02:12:20 +0000302}
303
304// -------------------------------------------------------------------
305// Helper methods that consume characters.
306
307template<typename CharacterClass>
308inline bool Tokenizer::LookingAt() {
309 return CharacterClass::InClass(current_char_);
310}
311
312template<typename CharacterClass>
313inline bool Tokenizer::TryConsumeOne() {
314 if (CharacterClass::InClass(current_char_)) {
315 NextChar();
316 return true;
317 } else {
318 return false;
319 }
320}
321
322inline bool Tokenizer::TryConsume(char c) {
323 if (current_char_ == c) {
324 NextChar();
325 return true;
326 } else {
327 return false;
328 }
329}
330
331template<typename CharacterClass>
332inline void Tokenizer::ConsumeZeroOrMore() {
333 while (CharacterClass::InClass(current_char_)) {
334 NextChar();
335 }
336}
337
338template<typename CharacterClass>
339inline void Tokenizer::ConsumeOneOrMore(const char* error) {
340 if (!CharacterClass::InClass(current_char_)) {
341 AddError(error);
342 } else {
343 do {
344 NextChar();
345 } while (CharacterClass::InClass(current_char_));
346 }
347}
348
349// -------------------------------------------------------------------
350// Methods that read whole patterns matching certain kinds of tokens
351// or comments.
352
353void Tokenizer::ConsumeString(char delimiter) {
354 while (true) {
355 switch (current_char_) {
356 case '\0':
jieluo@google.com4de8f552014-07-18 00:47:59 +0000357 AddError("Unexpected end of string.");
temporal40ee5512008-07-10 02:12:20 +0000358 return;
jieluo@google.com4de8f552014-07-18 00:47:59 +0000359
360 case '\n': {
361 if (!allow_multiline_strings_) {
362 AddError("String literals cannot cross line boundaries.");
363 return;
364 }
365 NextChar();
366 break;
temporal40ee5512008-07-10 02:12:20 +0000367 }
368
369 case '\\': {
370 // An escape sequence.
371 NextChar();
372 if (TryConsumeOne<Escape>()) {
373 // Valid escape sequence.
374 } else if (TryConsumeOne<OctalDigit>()) {
375 // Possibly followed by two more octal digits, but these will
376 // just be consumed by the main loop anyway so we don't need
377 // to do so explicitly here.
Feng Xiaoe841bac2015-12-11 17:09:20 -0800378 } else if (TryConsume('x')) {
temporal40ee5512008-07-10 02:12:20 +0000379 if (!TryConsumeOne<HexDigit>()) {
380 AddError("Expected hex digits for escape sequence.");
381 }
382 // Possibly followed by another hex digit, but again we don't care.
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000383 } else if (TryConsume('u')) {
384 if (!TryConsumeOne<HexDigit>() ||
385 !TryConsumeOne<HexDigit>() ||
386 !TryConsumeOne<HexDigit>() ||
387 !TryConsumeOne<HexDigit>()) {
388 AddError("Expected four hex digits for \\u escape sequence.");
389 }
390 } else if (TryConsume('U')) {
391 // We expect 8 hex digits; but only the range up to 0x10ffff is
392 // legal.
393 if (!TryConsume('0') ||
394 !TryConsume('0') ||
395 !(TryConsume('0') || TryConsume('1')) ||
396 !TryConsumeOne<HexDigit>() ||
397 !TryConsumeOne<HexDigit>() ||
398 !TryConsumeOne<HexDigit>() ||
399 !TryConsumeOne<HexDigit>() ||
400 !TryConsumeOne<HexDigit>()) {
401 AddError("Expected eight hex digits up to 10ffff for \\U escape "
402 "sequence");
403 }
temporal40ee5512008-07-10 02:12:20 +0000404 } else {
405 AddError("Invalid escape sequence in string literal.");
406 }
407 break;
408 }
409
410 default: {
411 if (current_char_ == delimiter) {
412 NextChar();
413 return;
414 }
415 NextChar();
416 break;
417 }
418 }
419 }
420}
421
422Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
423 bool started_with_dot) {
424 bool is_float = false;
425
426 if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
427 // A hex number (started with "0x").
428 ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
429
430 } else if (started_with_zero && LookingAt<Digit>()) {
431 // An octal number (had a leading zero).
432 ConsumeZeroOrMore<OctalDigit>();
433 if (LookingAt<Digit>()) {
434 AddError("Numbers starting with leading zero must be in octal.");
435 ConsumeZeroOrMore<Digit>();
436 }
437
438 } else {
439 // A decimal number.
440 if (started_with_dot) {
441 is_float = true;
442 ConsumeZeroOrMore<Digit>();
443 } else {
444 ConsumeZeroOrMore<Digit>();
445
446 if (TryConsume('.')) {
447 is_float = true;
448 ConsumeZeroOrMore<Digit>();
449 }
450 }
451
452 if (TryConsume('e') || TryConsume('E')) {
453 is_float = true;
454 TryConsume('-') || TryConsume('+');
455 ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
456 }
457
458 if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
459 is_float = true;
460 }
461 }
462
jieluo@google.com4de8f552014-07-18 00:47:59 +0000463 if (LookingAt<Letter>() && require_space_after_number_) {
temporal40ee5512008-07-10 02:12:20 +0000464 AddError("Need space between number and identifier.");
465 } else if (current_char_ == '.') {
466 if (is_float) {
467 AddError(
468 "Already saw decimal point or exponent; can't have another one.");
469 } else {
470 AddError("Hex and octal numbers must be integers.");
471 }
472 }
473
474 return is_float ? TYPE_FLOAT : TYPE_INTEGER;
475}
476
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000477void Tokenizer::ConsumeLineComment(string* content) {
478 if (content != NULL) RecordTo(content);
479
temporal40ee5512008-07-10 02:12:20 +0000480 while (current_char_ != '\0' && current_char_ != '\n') {
481 NextChar();
482 }
483 TryConsume('\n');
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000484
485 if (content != NULL) StopRecording();
temporal40ee5512008-07-10 02:12:20 +0000486}
487
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000488void Tokenizer::ConsumeBlockComment(string* content) {
temporal40ee5512008-07-10 02:12:20 +0000489 int start_line = line_;
490 int start_column = column_ - 2;
491
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000492 if (content != NULL) RecordTo(content);
493
temporal40ee5512008-07-10 02:12:20 +0000494 while (true) {
495 while (current_char_ != '\0' &&
496 current_char_ != '*' &&
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000497 current_char_ != '/' &&
498 current_char_ != '\n') {
temporal40ee5512008-07-10 02:12:20 +0000499 NextChar();
500 }
501
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000502 if (TryConsume('\n')) {
503 if (content != NULL) StopRecording();
504
505 // Consume leading whitespace and asterisk;
506 ConsumeZeroOrMore<WhitespaceNoNewline>();
507 if (TryConsume('*')) {
508 if (TryConsume('/')) {
509 // End of comment.
510 break;
511 }
512 }
513
514 if (content != NULL) RecordTo(content);
515 } else if (TryConsume('*') && TryConsume('/')) {
temporal40ee5512008-07-10 02:12:20 +0000516 // End of comment.
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000517 if (content != NULL) {
518 StopRecording();
519 // Strip trailing "*/".
520 content->erase(content->size() - 2);
521 }
temporal40ee5512008-07-10 02:12:20 +0000522 break;
523 } else if (TryConsume('/') && current_char_ == '*') {
524 // Note: We didn't consume the '*' because if there is a '/' after it
525 // we want to interpret that as the end of the comment.
526 AddError(
527 "\"/*\" inside block comment. Block comments cannot be nested.");
528 } else if (current_char_ == '\0') {
529 AddError("End-of-file inside block comment.");
530 error_collector_->AddError(
531 start_line, start_column, " Comment started here.");
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000532 if (content != NULL) StopRecording();
temporal40ee5512008-07-10 02:12:20 +0000533 break;
534 }
535 }
536}
537
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000538Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
539 if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
540 if (TryConsume('/')) {
541 return LINE_COMMENT;
542 } else if (TryConsume('*')) {
543 return BLOCK_COMMENT;
544 } else {
545 // Oops, it was just a slash. Return it.
546 current_.type = TYPE_SYMBOL;
547 current_.text = "/";
548 current_.line = line_;
549 current_.column = column_ - 1;
550 current_.end_column = column_;
551 return SLASH_NOT_COMMENT;
552 }
553 } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
554 return LINE_COMMENT;
555 } else {
556 return NO_COMMENT;
557 }
558}
559
temporal40ee5512008-07-10 02:12:20 +0000560// -------------------------------------------------------------------
561
562bool Tokenizer::Next() {
liujisi@google.com33165fe2010-11-02 13:14:58 +0000563 previous_ = current_;
temporal40ee5512008-07-10 02:12:20 +0000564
temporal40ee5512008-07-10 02:12:20 +0000565 while (!read_error_) {
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000566 ConsumeZeroOrMore<Whitespace>();
temporal40ee5512008-07-10 02:12:20 +0000567
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000568 switch (TryConsumeCommentStart()) {
569 case LINE_COMMENT:
570 ConsumeLineComment(NULL);
571 continue;
572 case BLOCK_COMMENT:
573 ConsumeBlockComment(NULL);
574 continue;
575 case SLASH_NOT_COMMENT:
temporal40ee5512008-07-10 02:12:20 +0000576 return true;
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000577 case NO_COMMENT:
578 break;
579 }
temporal40ee5512008-07-10 02:12:20 +0000580
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000581 // Check for EOF before continuing.
582 if (read_error_) break;
temporal40ee5512008-07-10 02:12:20 +0000583
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000584 if (LookingAt<Unprintable>() || current_char_ == '\0') {
temporal40ee5512008-07-10 02:12:20 +0000585 AddError("Invalid control characters encountered in text.");
586 NextChar();
587 // Skip more unprintable characters, too. But, remember that '\0' is
588 // also what current_char_ is set to after EOF / read error. We have
589 // to be careful not to go into an infinite loop of trying to consume
590 // it, so make sure to check read_error_ explicitly before consuming
591 // '\0'.
592 while (TryConsumeOne<Unprintable>() ||
593 (!read_error_ && TryConsume('\0'))) {
594 // Ignore.
595 }
596
597 } else {
598 // Reading some sort of token.
599 StartToken();
600
601 if (TryConsumeOne<Letter>()) {
602 ConsumeZeroOrMore<Alphanumeric>();
603 current_.type = TYPE_IDENTIFIER;
604 } else if (TryConsume('0')) {
605 current_.type = ConsumeNumber(true, false);
606 } else if (TryConsume('.')) {
607 // This could be the beginning of a floating-point number, or it could
608 // just be a '.' symbol.
609
610 if (TryConsumeOne<Digit>()) {
611 // It's a floating-point number.
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000612 if (previous_.type == TYPE_IDENTIFIER &&
613 current_.line == previous_.line &&
614 current_.column == previous_.end_column) {
temporal40ee5512008-07-10 02:12:20 +0000615 // We don't accept syntax like "blah.123".
616 error_collector_->AddError(line_, column_ - 2,
617 "Need space between identifier and decimal point.");
618 }
619 current_.type = ConsumeNumber(false, true);
620 } else {
621 current_.type = TYPE_SYMBOL;
622 }
623 } else if (TryConsumeOne<Digit>()) {
624 current_.type = ConsumeNumber(false, false);
625 } else if (TryConsume('\"')) {
626 ConsumeString('\"');
627 current_.type = TYPE_STRING;
628 } else if (TryConsume('\'')) {
629 ConsumeString('\'');
630 current_.type = TYPE_STRING;
631 } else {
jieluo@google.com4de8f552014-07-18 00:47:59 +0000632 // Check if the high order bit is set.
633 if (current_char_ & 0x80) {
634 error_collector_->AddError(line_, column_,
635 StringPrintf("Interpreting non ascii codepoint %d.",
636 static_cast<unsigned char>(current_char_)));
637 }
temporal40ee5512008-07-10 02:12:20 +0000638 NextChar();
639 current_.type = TYPE_SYMBOL;
640 }
641
642 EndToken();
643 return true;
644 }
temporal40ee5512008-07-10 02:12:20 +0000645 }
646
647 // EOF
648 current_.type = TYPE_END;
649 current_.text.clear();
650 current_.line = line_;
651 current_.column = column_;
liujisi@google.com33165fe2010-11-02 13:14:58 +0000652 current_.end_column = column_;
temporal40ee5512008-07-10 02:12:20 +0000653 return false;
654}
655
namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed.  When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
class CommentCollector {
 public:
  // Any of the three outputs may be NULL, in which case comments destined for
  // it are dropped.  Non-NULL outputs are cleared on construction.
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    // A block comment never merges with whatever was buffered before it.
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discards the buffered comment, if any.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        // Only the first flushed comment can trail the previous token.
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // From now on, flushed comments are detached rather than attached to the
  // previous token.
  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;

  string comment_buffer_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

} // namespace
758
759bool Tokenizer::NextWithComments(string* prev_trailing_comments,
760 vector<string>* detached_comments,
761 string* next_leading_comments) {
762 CommentCollector collector(prev_trailing_comments, detached_comments,
763 next_leading_comments);
764
765 if (current_.type == TYPE_START) {
Jie Luob2d2cf82015-07-15 14:31:19 -0700766 // Ignore unicode byte order mark(BOM) if it appears at the file
767 // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
768 if (TryConsume((char)0xEF)) {
769 if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
770 AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
771 "Only UTF-8 is accepted for proto file.");
772 return false;
773 }
774 }
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000775 collector.DetachFromPrev();
776 } else {
777 // A comment appearing on the same line must be attached to the previous
778 // declaration.
779 ConsumeZeroOrMore<WhitespaceNoNewline>();
780 switch (TryConsumeCommentStart()) {
781 case LINE_COMMENT:
782 ConsumeLineComment(collector.GetBufferForLineComment());
783
784 // Don't allow comments on subsequent lines to be attached to a trailing
785 // comment.
786 collector.Flush();
787 break;
788 case BLOCK_COMMENT:
789 ConsumeBlockComment(collector.GetBufferForBlockComment());
790
791 ConsumeZeroOrMore<WhitespaceNoNewline>();
792 if (!TryConsume('\n')) {
793 // Oops, the next token is on the same line. If we recorded a comment
794 // we really have no idea which token it should be attached to.
795 collector.ClearBuffer();
796 return Next();
797 }
798
799 // Don't allow comments on subsequent lines to be attached to a trailing
800 // comment.
801 collector.Flush();
802 break;
803 case SLASH_NOT_COMMENT:
804 return true;
805 case NO_COMMENT:
806 if (!TryConsume('\n')) {
807 // The next token is on the same line. There are no comments.
808 return Next();
809 }
810 break;
811 }
812 }
813
814 // OK, we are now on the line *after* the previous token.
815 while (true) {
816 ConsumeZeroOrMore<WhitespaceNoNewline>();
817
818 switch (TryConsumeCommentStart()) {
819 case LINE_COMMENT:
820 ConsumeLineComment(collector.GetBufferForLineComment());
821 break;
822 case BLOCK_COMMENT:
823 ConsumeBlockComment(collector.GetBufferForBlockComment());
824
825 // Consume the rest of the line so that we don't interpret it as a
826 // blank line the next time around the loop.
827 ConsumeZeroOrMore<WhitespaceNoNewline>();
828 TryConsume('\n');
829 break;
830 case SLASH_NOT_COMMENT:
831 return true;
832 case NO_COMMENT:
833 if (TryConsume('\n')) {
834 // Completely blank line.
835 collector.Flush();
836 collector.DetachFromPrev();
837 } else {
838 bool result = Next();
839 if (!result ||
840 current_.text == "}" ||
841 current_.text == "]" ||
842 current_.text == ")") {
843 // It looks like we're at the end of a scope. In this case it
844 // makes no sense to attach a comment to the following token.
845 collector.Flush();
846 }
847 return result;
848 }
849 break;
850 }
851 }
852}
853
temporal40ee5512008-07-10 02:12:20 +0000854// -------------------------------------------------------------------
855// Token-parsing helpers. Remember that these don't need to report
856// errors since any errors should already have been reported while
857// tokenizing. Also, these can assume that whatever text they
858// are given is text that the tokenizer actually parsed as a token
859// of the given type.
860
861bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
862 uint64* output) {
863 // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
864 // is non-standard. I hate the C standard library. :(
865
866// return strtoull(text.c_str(), NULL, 0);
867
868 const char* ptr = text.c_str();
869 int base = 10;
870 if (ptr[0] == '0') {
kenton@google.com80b1d622009-07-29 01:13:20 +0000871 if (ptr[1] == 'x' || ptr[1] == 'X') {
temporal40ee5512008-07-10 02:12:20 +0000872 // This is hex.
873 base = 16;
874 ptr += 2;
875 } else {
876 // This is octal.
877 base = 8;
878 }
879 }
880
881 uint64 result = 0;
882 for (; *ptr != '\0'; ptr++) {
883 int digit = DigitValue(*ptr);
Jisi Liu3b3c8ab2016-03-30 11:39:59 -0700884 if (digit < 0 || digit >= base) {
885 // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
886 // token, but Tokenizer still think it's integer.
887 return false;
888 }
temporal40ee5512008-07-10 02:12:20 +0000889 if (digit > max_value || result > (max_value - digit) / base) {
890 // Overflow.
891 return false;
892 }
893 result = result * base + digit;
894 }
895
896 *output = result;
897 return true;
898}
899
900double Tokenizer::ParseFloat(const string& text) {
901 const char* start = text.c_str();
902 char* end;
903 double result = NoLocaleStrtod(start, &end);
904
905 // "1e" is not a valid float, but if the tokenizer reads it, it will
906 // report an error but still return it as a valid token. We need to
907 // accept anything the tokenizer could possibly return, error or not.
908 if (*end == 'e' || *end == 'E') {
909 ++end;
910 if (*end == '-' || *end == '+') ++end;
911 }
912
913 // If the Tokenizer had allow_f_after_float_ enabled, the float may be
914 // suffixed with the letter 'f'.
915 if (*end == 'f' || *end == 'F') {
916 ++end;
917 }
918
919 GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
920 << " Tokenizer::ParseFloat() passed text that could not have been"
921 " tokenized as a float: " << CEscape(text);
922 return result;
923}
924
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +0000925// Helper to append a Unicode code point to a string as UTF8, without bringing
926// in any external dependencies.
927static void AppendUTF8(uint32 code_point, string* output) {
928 uint32 tmp = 0;
929 int len = 0;
930 if (code_point <= 0x7f) {
931 tmp = code_point;
932 len = 1;
933 } else if (code_point <= 0x07ff) {
934 tmp = 0x0000c080 |
935 ((code_point & 0x07c0) << 2) |
936 (code_point & 0x003f);
937 len = 2;
938 } else if (code_point <= 0xffff) {
939 tmp = 0x00e08080 |
940 ((code_point & 0xf000) << 4) |
941 ((code_point & 0x0fc0) << 2) |
942 (code_point & 0x003f);
943 len = 3;
944 } else if (code_point <= 0x1fffff) {
945 tmp = 0xf0808080 |
946 ((code_point & 0x1c0000) << 6) |
947 ((code_point & 0x03f000) << 4) |
948 ((code_point & 0x000fc0) << 2) |
949 (code_point & 0x003f);
950 len = 4;
951 } else {
952 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
953 // normally only defined up to there as well.
954 StringAppendF(output, "\\U%08x", code_point);
955 return;
956 }
957 tmp = ghtonl(tmp);
958 output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
959}
960
961// Try to read <len> hex digits from ptr, and stuff the numeric result into
962// *result. Returns true if that many digits were successfully consumed.
963static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
964 *result = 0;
965 if (len == 0) return false;
966 for (const char* end = ptr + len; ptr < end; ++ptr) {
967 if (*ptr == '\0') return false;
968 *result = (*result << 4) + DigitValue(*ptr);
969 }
970 return true;
971}
972
973// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
974// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
975// surrogate. These numbers are in a reserved range of Unicode code points, so
976// if we encounter such a pair we know how to parse it and convert it into a
977// single code point.
978static const uint32 kMinHeadSurrogate = 0xd800;
979static const uint32 kMaxHeadSurrogate = 0xdc00;
980static const uint32 kMinTrailSurrogate = 0xdc00;
981static const uint32 kMaxTrailSurrogate = 0xe000;
982
983static inline bool IsHeadSurrogate(uint32 code_point) {
984 return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
985}
986
987static inline bool IsTrailSurrogate(uint32 code_point) {
988 return (code_point >= kMinTrailSurrogate) &&
989 (code_point < kMaxTrailSurrogate);
990}
991
992// Combine a head and trail surrogate into a single Unicode code point.
993static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
994 GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
995 GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
996 return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
997 (trail_surrogate - kMinTrailSurrogate));
998}
999
// Maps the character that introduces a Unicode escape to the number of hex
// digits expected after it: 4 for 'u', 8 for 'U', and 0 for anything else.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
1006
1007// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1008// to parse that sequence. On success, returns a pointer to the first char
1009// beyond that sequence, and fills in *code_point. On failure, returns ptr
1010// itself.
1011static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
1012 const char* p = ptr;
1013 // Fetch the code point.
1014 const int len = UnicodeLength(*p++);
1015 if (!ReadHexDigits(p, len, code_point))
1016 return ptr;
1017 p += len;
1018
1019 // Check if the code point we read is a "head surrogate." If so, then we
1020 // expect it to be immediately followed by another code point which is a valid
1021 // "trail surrogate," and together they form a UTF-16 pair which decodes into
1022 // a single Unicode point. Trail surrogates may only use \u, not \U.
1023 if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
1024 uint32 trail_surrogate;
1025 if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
1026 IsTrailSurrogate(trail_surrogate)) {
1027 *code_point = AssembleUTF16(*code_point, trail_surrogate);
1028 p += 6;
1029 }
1030 // If this failed, then we just emit the head surrogate as a code point.
1031 // It's bogus, but so is the string.
1032 }
1033
1034 return p;
1035}
1036
1037// The text string must begin and end with single or double quote
1038// characters.
kenton@google.com26bd9ee2008-11-21 00:06:27 +00001039void Tokenizer::ParseStringAppend(const string& text, string* output) {
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +00001040 // Reminder: text[0] is always a quote character. (If text is
1041 // empty, it's invalid, so we'll just return).
1042 const size_t text_size = text.size();
1043 if (text_size == 0) {
temporal40ee5512008-07-10 02:12:20 +00001044 GOOGLE_LOG(DFATAL)
kenton@google.com26bd9ee2008-11-21 00:06:27 +00001045 << " Tokenizer::ParseStringAppend() passed text that could not"
1046 " have been tokenized as a string: " << CEscape(text);
temporal40ee5512008-07-10 02:12:20 +00001047 return;
1048 }
1049
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +00001050 // Reserve room for new string. The branch is necessary because if
1051 // there is already space available the reserve() call might
1052 // downsize the output.
1053 const size_t new_len = text_size + output->size();
1054 if (new_len > output->capacity()) {
1055 output->reserve(new_len);
1056 }
temporal40ee5512008-07-10 02:12:20 +00001057
1058 // Loop through the string copying characters to "output" and
1059 // interpreting escape sequences. Note that any invalid escape
1060 // sequences or other errors were already reported while tokenizing.
1061 // In this case we do not need to produce valid results.
1062 for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1063 if (*ptr == '\\' && ptr[1] != '\0') {
1064 // An escape sequence.
1065 ++ptr;
1066
1067 if (OctalDigit::InClass(*ptr)) {
1068 // An octal escape. May one, two, or three digits.
1069 int code = DigitValue(*ptr);
1070 if (OctalDigit::InClass(ptr[1])) {
1071 ++ptr;
1072 code = code * 8 + DigitValue(*ptr);
1073 }
1074 if (OctalDigit::InClass(ptr[1])) {
1075 ++ptr;
1076 code = code * 8 + DigitValue(*ptr);
1077 }
1078 output->push_back(static_cast<char>(code));
1079
1080 } else if (*ptr == 'x') {
1081 // A hex escape. May zero, one, or two digits. (The zero case
1082 // will have been caught as an error earlier.)
1083 int code = 0;
1084 if (HexDigit::InClass(ptr[1])) {
1085 ++ptr;
1086 code = DigitValue(*ptr);
1087 }
1088 if (HexDigit::InClass(ptr[1])) {
1089 ++ptr;
1090 code = code * 16 + DigitValue(*ptr);
1091 }
1092 output->push_back(static_cast<char>(code));
1093
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +00001094 } else if (*ptr == 'u' || *ptr == 'U') {
1095 uint32 unicode;
1096 const char* end = FetchUnicodePoint(ptr, &unicode);
1097 if (end == ptr) {
1098 // Failure: Just dump out what we saw, don't try to parse it.
1099 output->push_back(*ptr);
1100 } else {
1101 AppendUTF8(unicode, output);
1102 ptr = end - 1; // Because we're about to ++ptr.
1103 }
temporal40ee5512008-07-10 02:12:20 +00001104 } else {
1105 // Some other escape code.
1106 output->push_back(TranslateEscape(*ptr));
1107 }
1108
xiaofeng@google.comb55a20f2012-09-22 02:40:50 +00001109 } else if (*ptr == text[0] && ptr[1] == '\0') {
1110 // Ignore final quote matching the starting quote.
temporal40ee5512008-07-10 02:12:20 +00001111 } else {
1112 output->push_back(*ptr);
1113 }
1114 }
temporal40ee5512008-07-10 02:12:20 +00001115}
1116
jieluo@google.com4de8f552014-07-18 00:47:59 +00001117template<typename CharacterClass>
1118static bool AllInClass(const string& s) {
1119 for (int i = 0; i < s.size(); ++i) {
1120 if (!CharacterClass::InClass(s[i]))
1121 return false;
1122 }
1123 return true;
1124}
1125
1126bool Tokenizer::IsIdentifier(const string& text) {
1127 // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1128 if (text.size() == 0)
1129 return false;
1130 if (!Letter::InClass(text.at(0)))
1131 return false;
1132 if (!AllInClass<Alphanumeric>(text.substr(1)))
1133 return false;
1134 return true;
1135}
1136
temporal40ee5512008-07-10 02:12:20 +00001137} // namespace io
1138} // namespace protobuf
1139} // namespace google