// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
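//
// TEST_2D works the same way but takes two case arrays and runs the body
// once per combination.  A minimal hypothetical sketch (MyFixture, kCases1,
// and kCases2 are made-up names; the real invocations appear later in this
// file, e.g. TEST_2D(TokenizerTest, SimpleTokens, ...)):
//
// int kCases1[] = {1, 2, 3};
// const char* kCases2[] = {"a", "b"};
// TEST_2D(MyFixture, MyTest, kCases1, kCases2) {
//   // Runs 3 * 2 = 6 times, with SCOPED_TRACE naming both cases.
//   EXPECT_GT(kCases1_case, 0);
//   EXPECT_TRUE(kCases2_case != NULL);
// }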

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                              \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                            \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                       \
    void DoSingleCase(const CaseType1& CASES1##_case,                       \
                      const CaseType2& CASES2##_case);                      \
  };                                                                        \
                                                                            \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                     \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                    \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                  \
        SCOPED_TRACE(testing::Message()                                     \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", " \
                     << #CASES2 " case #" << j << ": " << CASES2[j]);       \
        DoSingleCase(CASES1[i], CASES2[j]);                                 \
      }                                                                     \
    }                                                                       \
  }                                                                         \
                                                                            \
  template <typename CaseType1, typename CaseType2>                         \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case,  \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer on the first call and on every later call
    // whose index is a multiple of 3 or 5.  Returning true with a
    // zero-size buffer is legal for a ZeroCopyInputStream, so this
    // exercises the tokenizer's handling of empty reads.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};
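
// Used throughout this file as, e.g.:
//   TestInputStream input(text.data(), text.size(), kBlockSizes_case);
// so that every test case is exercised against each block size in
// kBlockSizes (defined below).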

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};
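
// For example (this mirrors one of the cases below), the input
//   "prev // trailing comment\nnext"
// should yield " trailing comment\n" as prev_trailing_comments, no
// detached comments, and an empty next_leading_comments.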

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to
  // NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Bound against the current case's detached_comments array, not against
    // the number of cases.
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
            kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail
  // surrogate.  We just output this as if it were UTF8; it's not a defined
  // code point, but it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};
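
// Errors are rendered by TestErrorCollector above as "line:column: message\n"
// (both zero-based), so e.g. the first case below expects the bad escape in
// "'\l' foo" to be reported at line 0, column 2.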

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\X' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google