// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
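//
// TEST_2D works the same way, but over all pairs drawn from two arrays.
// A sketch of its use (the fixture, test name, and second array here are
// hypothetical, chosen just for illustration):
//
// const char* kSuffixes[] = {"a", "b"};
// TEST_2D(MyFixture, MyPairTest, kCases, kSuffixes) {
//   EXPECT_GT(kCases_case, 0);
//   EXPECT_TRUE(kSuffixes_case != NULL);
// }
//
// This runs DoSingleCase() once for each (kCases[i], kSuffixes[j])
// combination, eight runs in total for the arrays above.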

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
                     << #CASES1 " case #" << i << ": " << CASES1[i] << ", "\
                     << #CASES2 " case #" << j << ": " << CASES2[j]);      \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer whenever the call count (starting at zero) is
    // divisible by 3 or 5.  This includes the very first call.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

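// A sketch of the call pattern this produces (illustrative only; this
// particular stream is not constructed anywhere in the tests):
//
//   TestInputStream stream("hello", 5, 2);
//   const void* ptr;
//   int size;
//   stream.Next(&ptr, &size);  // call #0: empty buffer (0 is divisible by 3)
//   stream.Next(&ptr, &size);  // call #1: "he"
//   stream.Next(&ptr, &size);  // call #2: "ll"
//   stream.Next(&ptr, &size);  // call #3: empty buffer (3 is divisible by 3)
//   stream.Next(&ptr, &size);  // call #4: "o"
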
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
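// For example, AddError(1, 5, "Unexpected token.") appends the line
// "1:5: Unexpected token.\n" to text_.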
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

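// Each Tokenizer::Token in the expected output below is written as
// { type, text, line, column, end_column }.
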
MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0, 1 },
    { Tokenizer::TYPE_END   , ""    , 0, 1, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  // Note that '9' is not an octal digit, so the "\739" below is the octal
  // escape "\73" followed by the literal character '9'.
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float is not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n" },

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google