// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
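//
// TEST_2D works the same way, but over all pairs drawn from two arrays.
// A sketch of its use (the fixture, test name, and second array here are
// hypothetical, chosen just for illustration):
//
// const char* kSuffixes[] = {"a", "b"};
// TEST_2D(MyFixture, MyPairTest, kCases, kSuffixes) {
//   EXPECT_GT(kCases_case, 0);
//   EXPECT_TRUE(kSuffixes_case != NULL);
// }
//
// This runs DoSingleCase() once for each (kCases[i], kSuffixes[j])
// combination, eight runs in total for the arrays above.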

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", "\
                     << #CASES2 " case #" << j << ": " << CASES2[j]);      \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer whenever the call count (starting at zero) is
    // divisible by 3 or 5.  This includes the very first call.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

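// A sketch of the call pattern this produces (illustrative only; this
// particular stream is not constructed anywhere in the tests):
//
//   TestInputStream stream("hello", 5, 2);
//   const void* ptr;
//   int size;
//   stream.Next(&ptr, &size);  // call #0: empty buffer (0 is divisible by 3)
//   stream.Next(&ptr, &size);  // call #1: "he"
//   stream.Next(&ptr, &size);  // call #2: "ll"
//   stream.Next(&ptr, &size);  // call #3: empty buffer (3 is divisible by 3)
//   stream.Next(&ptr, &size);  // call #4: "o"
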
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
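// For example, AddError(1, 5, "Unexpected token.") appends the line
// "1:5: Unexpected token.\n" to text_.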
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

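// Each Tokenizer::Token in the expected output below is written as
// { type, text, line, column, end_column }.
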
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0, 1 },
    { Tokenizer::TYPE_END   , ""    , 0, 1, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  // Note that '9' is not an octal digit, so the "\739" below is the octal
  // escape "\73" followed by the literal character '9'.
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float is not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n" },

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google