Blame - src/ProtocolBuffers/TextTokenizer.cs - platform/external/protobuf-javalite

blob: ecd031abcc85ee8e91b2d3c5b8e6b5bc89a73906 [file] [log] [blame]

Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc. All rights reserved.
				3	// http://github.com/jskeet/dotnet-protobufs/
				4	// Original C++/Java/Python code:
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	5	// http://code.google.com/p/protobuf/
				6	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	7	// Redistribution and use in source and binary forms, with or without
				8	// modification, are permitted provided that the following conditions are
				9	// met:
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	10	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	11	// * Redistributions of source code must retain the above copyright
				12	// notice, this list of conditions and the following disclaimer.
				13	// * Redistributions in binary form must reproduce the above
				14	// copyright notice, this list of conditions and the following disclaimer
				15	// in the documentation and/or other materials provided with the
				16	// distribution.
				17	// * Neither the name of Google Inc. nor the names of its
				18	// contributors may be used to endorse or promote products derived from
				19	// this software without specific prior written permission.
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	20	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	21	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				22	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				23	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				24	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				25	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				26	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				27	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				28	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				29	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				30	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				31	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	32	using System;
				33	using System.Globalization;
				34	using System.Text.RegularExpressions;
				35
				36	namespace Google.ProtocolBuffers {
				37	/// <summary>
				38	/// Represents a stream of tokens parsed from a string.
				39	/// </summary>
				40	internal sealed class TextTokenizer {
				41	private readonly string text;
				42	private string currentToken;
				43
				44	/// <summary>
				45	/// The character index within the text to perform the next regex match at.
				46	/// </summary>
				47	private int matchPos = 0;
				48
				49	/// <summary>
				50	/// The character index within the text at which the current token begins.
				51	/// </summary>
				52	private int pos = 0;
				53
				54	/// <summary>
				55	/// The line number of the current token.
				56	/// </summary>
				57	private int line = 0;
				58	/// <summary>
				59	/// The column number of the current token.
				60	/// </summary>
				61	private int column = 0;
				62
				63	/// <summary>
				64	/// The line number of the previous token.
				65	/// </summary>
				66	private int previousLine = 0;
				67	/// <summary>
				68	/// The column number of the previous token.
				69	/// </summary>
				70	private int previousColumn = 0;
				71
				72	private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s\|(#.*$))+",
				73	RegexOptions.Compiled \| RegexOptions.Multiline);
				74	private static readonly Regex TokenPattern = new Regex(
				75	"\\G[a-zA-Z_][0-9a-zA-Z_+-]*\|" + // an identifier
				76	"\\G[0-9+-][0-9a-zA-Z_.+-]*\|" + // a number
				77	"\\G\"([^\"\\\n\\\\]\|\\\\.)*(\"\|\\\\?$)\|" + // a double-quoted string
				78	"\\G\'([^\"\\\n\\\\]\|\\\\.)*(\'\|\\\\?$)", // a single-quoted string
				79	RegexOptions.Compiled \| RegexOptions.Multiline);
				80
				81	private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				82	private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				83	private static readonly Regex FloatNan = new Regex("^nanf?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				84
				85	/** Construct a tokenizer that parses tokens from the given text. */
				86	public TextTokenizer(string text) {
				87	this.text = text;
				88	SkipWhitespace();
				89	NextToken();
				90	}
				91
				92	/// <summary>
				93	/// Are we at the end of the input?
				94	/// </summary>
				95	public bool AtEnd {
				96	get { return currentToken.Length == 0; }
				97	}
				98
				99	/// <summary>
				100	/// Advances to the next token.
				101	/// </summary>
				102	public void NextToken() {
				103	previousLine = line;
				104	previousColumn = column;
				105
				106	// Advance the line counter to the current position.
				107	while (pos < matchPos) {
				108	if (text[pos] == '\n') {
				109	++line;
				110	column = 0;
				111	} else {
				112	++column;
				113	}
				114	++pos;
				115	}
				116
				117	// Match the next token.
				118	if (matchPos == text.Length) {
				119	// EOF
				120	currentToken = "";
				121	} else {
				122	Match match = TokenPattern.Match(text, matchPos);
				123	if (match.Success) {
				124	currentToken = match.Value;
				125	matchPos += match.Length;
				126	} else {
				127	// Take one character.
				128	currentToken = text[matchPos].ToString();
				129	matchPos++;
				130	}
				131
				132	SkipWhitespace();
				133	}
				134	}
				135
				136	/// <summary>
				137	/// Skip over any whitespace so that matchPos starts at the next token.
				138	/// </summary>
				139	private void SkipWhitespace() {
				140	Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
				141	if (match.Success) {
				142	matchPos += match.Length;
				143	}
				144	}
				145
				146	/// <summary>
				147	/// If the next token exactly matches the given token, consume it and return
				148	/// true. Otherwise, return false without doing anything.
				149	/// </summary>
				150	public bool TryConsume(string token) {
				151	if (currentToken == token) {
				152	NextToken();
				153	return true;
				154	}
				155	return false;
				156	}
				157
				158	/*
				159	* If the next token exactly matches {@code token}, consume it. Otherwise,
				160	* throw a {@link ParseException}.
				161	*/
				162	/// <summary>
				163	/// If the next token exactly matches the specified one, consume it.
				164	/// Otherwise, throw a FormatException.
				165	/// </summary>
				166	/// <param name="token"></param>
				167	public void Consume(string token) {
				168	if (!TryConsume(token)) {
				169	throw CreateFormatException("Expected \"" + token + "\".");
				170	}
				171	}
				172
				173	/// <summary>
				174	/// Returns true if the next token is an integer, but does not consume it.
				175	/// </summary>
				176	public bool LookingAtInteger() {
				177	if (currentToken.Length == 0) {
				178	return false;
				179	}
				180
				181	char c = currentToken[0];
				182	return ('0' <= c && c <= '9') \|\| c == '-' \|\| c == '+';
				183	}
				184
				185	/// <summary>
				186	/// If the next token is an identifier, consume it and return its value.
				187	/// Otherwise, throw a FormatException.
				188	/// </summary>
				189	public string ConsumeIdentifier() {
				190	foreach (char c in currentToken) {
				191	if (('a' <= c && c <= 'z') \|\|
				192	('A' <= c && c <= 'Z') \|\|
				193	('0' <= c && c <= '9') \|\|
				194	(c == '_') \|\| (c == '.')) {
				195	// OK
				196	} else {
				197	throw CreateFormatException("Expected identifier.");
				198	}
				199	}
				200
				201	string result = currentToken;
				202	NextToken();
				203	return result;
				204	}
				205
				206	/// <summary>
				207	/// If the next token is a 32-bit signed integer, consume it and return its
				208	/// value. Otherwise, throw a FormatException.
				209	/// </summary>
				210	public int ConsumeInt32() {
				211	try {
				212	int result = TextFormat.ParseInt32(currentToken);
				213	NextToken();
				214	return result;
				215	} catch (FormatException e) {
				216	throw CreateIntegerParseException(e);
				217	}
				218	}
				219
				220	/// <summary>
				221	/// If the next token is a 32-bit unsigned integer, consume it and return its
				222	/// value. Otherwise, throw a FormatException.
				223	/// </summary>
				224	public uint ConsumeUInt32() {
				225	try {
				226	uint result = TextFormat.ParseUInt32(currentToken);
				227	NextToken();
				228	return result;
				229	} catch (FormatException e) {
				230	throw CreateIntegerParseException(e);
				231	}
				232	}
				233
				234	/// <summary>
				235	/// If the next token is a 64-bit signed integer, consume it and return its
				236	/// value. Otherwise, throw a FormatException.
				237	/// </summary>
				238	public long ConsumeInt64() {
				239	try {
				240	long result = TextFormat.ParseInt64(currentToken);
				241	NextToken();
				242	return result;
				243	} catch (FormatException e) {
				244	throw CreateIntegerParseException(e);
				245	}
				246	}
				247
				248	/// <summary>
				249	/// If the next token is a 64-bit unsigned integer, consume it and return its
				250	/// value. Otherwise, throw a FormatException.
				251	/// </summary>
				252	public ulong ConsumeUInt64() {
				253	try {
				254	ulong result = TextFormat.ParseUInt64(currentToken);
				255	NextToken();
				256	return result;
				257	} catch (FormatException e) {
				258	throw CreateIntegerParseException(e);
				259	}
				260	}
				261
				262	/// <summary>
				263	/// If the next token is a double, consume it and return its value.
				264	/// Otherwise, throw a FormatException.
				265	/// </summary>
				266	public double ConsumeDouble() {
				267	// We need to parse infinity and nan separately because
				268	// double.Parse() does not accept "inf", "infinity", or "nan".
				269	if (DoubleInfinity.IsMatch(currentToken)) {
				270	bool negative = currentToken.StartsWith("-");
				271	NextToken();
				272	return negative ? double.NegativeInfinity : double.PositiveInfinity;
				273	}
				274	if (currentToken.Equals("nan", StringComparison.InvariantCultureIgnoreCase)) {
				275	NextToken();
				276	return Double.NaN;
				277	}
				278
				279	try {
				280	double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
				281	NextToken();
				282	return result;
				283	} catch (FormatException e) {
				284	throw CreateFloatParseException(e);
				285	} catch (OverflowException e) {
				286	throw CreateFloatParseException(e);
				287	}
				288	}
				289
				290	/// <summary>
				291	/// If the next token is a float, consume it and return its value.
				292	/// Otherwise, throw a FormatException.
				293	/// </summary>
				294	public float ConsumeFloat() {
				295
				296	// We need to parse infinity and nan separately because
				297	// Float.parseFloat() does not accept "inf", "infinity", or "nan".
				298	if (FloatInfinity.IsMatch(currentToken)) {
				299	bool negative = currentToken.StartsWith("-");
				300	NextToken();
				301	return negative ? float.NegativeInfinity : float.PositiveInfinity;
				302	}
				303	if (FloatNan.IsMatch(currentToken)) {
				304	NextToken();
				305	return float.NaN;
				306	}
				307
				308	if (currentToken.EndsWith("f")) {
				309	currentToken = currentToken.TrimEnd('f');
				310	}
				311
				312	try {
				313	float result = float.Parse(currentToken, CultureInfo.InvariantCulture);
				314	NextToken();
				315	return result;
				316	} catch (FormatException e) {
				317	throw CreateFloatParseException(e);
				318	} catch (OverflowException e) {
				319	throw CreateFloatParseException(e);
				320	}
				321	}
				322
				323	/// <summary>
				324	/// If the next token is a Boolean, consume it and return its value.
				325	/// Otherwise, throw a FormatException.
				326	/// </summary>
				327	public bool ConsumeBoolean() {
				328	if (currentToken == "true") {
				329	NextToken();
				330	return true;
				331	}
				332	if (currentToken == "false") {
				333	NextToken();
				334	return false;
				335	}
				336	throw CreateFormatException("Expected \"true\" or \"false\".");
				337	}
				338
				339	/// <summary>
				340	/// If the next token is a string, consume it and return its (unescaped) value.
				341	/// Otherwise, throw a FormatException.
				342	/// </summary>
				343	public string ConsumeString() {
				344	return ConsumeByteString().ToStringUtf8();
				345	}
				346
				347	/// <summary>
				348	/// If the next token is a string, consume it, unescape it as a
				349	/// ByteString and return it. Otherwise, throw a FormatException.
				350	/// </summary>
				351	public ByteString ConsumeByteString() {
				352	char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
				353	if (quote != '\"' && quote != '\'') {
				354	throw CreateFormatException("Expected string.");
				355	}
				356
				357	if (currentToken.Length < 2 \|\|
				358	currentToken[currentToken.Length-1] != quote) {
				359	throw CreateFormatException("String missing ending quote.");
				360	}
				361
				362	try {
				363	string escaped = currentToken.Substring(1, currentToken.Length - 2);
				364	ByteString result = TextFormat.UnescapeBytes(escaped);
				365	NextToken();
				366	return result;
				367	} catch (FormatException e) {
				368	throw CreateFormatException(e.Message);
				369	}
				370	}
				371
				372	/// <summary>
				373	/// Returns a format exception with the current line and column numbers
				374	/// in the description, suitable for throwing.
				375	/// </summary>
				376	public FormatException CreateFormatException(string description) {
				377	// Note: People generally prefer one-based line and column numbers.
				378	return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
				379	}
				380
				381	/// <summary>
				382	/// Returns a format exception with the line and column numbers of the
				383	/// previous token in the description, suitable for throwing.
				384	/// </summary>
				385	public FormatException CreateFormatExceptionPreviousToken(string description) {
				386	// Note: People generally prefer one-based line and column numbers.
				387	return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
				388	}
				389
				390	/// <summary>
				391	/// Constructs an appropriate FormatException for the given existing exception
				392	/// when trying to parse an integer.
				393	/// </summary>
				394	private FormatException CreateIntegerParseException(FormatException e) {
				395	return CreateFormatException("Couldn't parse integer: " + e.Message);
				396	}
				397
				398	/// <summary>
				399	/// Constructs an appropriate FormatException for the given existing exception
				400	/// when trying to parse a float or double.
				401	/// </summary>
				402	private FormatException CreateFloatParseException(Exception e) {
				403	return CreateFormatException("Couldn't parse number: " + e.Message);
				404	}
				405	}
				406	}