Blame - src/ProtocolBuffers/TextTokenizer.cs - platform/external/protobuf-javalite

blob: 0787c03e48857b47c65fedff51073f8a93141d46 [file] [log] [blame]

Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc. All rights reserved.
				3	// http://github.com/jskeet/dotnet-protobufs/
				4	// Original C++/Java/Python code:
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	5	// http://code.google.com/p/protobuf/
				6	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	7	// Redistribution and use in source and binary forms, with or without
				8	// modification, are permitted provided that the following conditions are
				9	// met:
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	10	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	11	// * Redistributions of source code must retain the above copyright
				12	// notice, this list of conditions and the following disclaimer.
				13	// * Redistributions in binary form must reproduce the above
				14	// copyright notice, this list of conditions and the following disclaimer
				15	// in the documentation and/or other materials provided with the
				16	// distribution.
				17	// * Neither the name of Google Inc. nor the names of its
				18	// contributors may be used to endorse or promote products derived from
				19	// this software without specific prior written permission.
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	20	//
Jon Skeet	60c059b	2008-10-23 21:17:56 +0100	[diff] [blame]	21	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				22	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				23	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				24	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				25	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				26	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				27	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				28	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				29	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				30	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				31	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	32	using System;
				33	using System.Globalization;
				34	using System.Text.RegularExpressions;
				35
				36	namespace Google.ProtocolBuffers {
				37	/// <summary>
				38	/// Represents a stream of tokens parsed from a string.
				39	/// </summary>
				40	internal sealed class TextTokenizer {
    // The complete input being tokenized, and the token most recently read from it.
    private readonly string text;
    private string currentToken;

    /// <summary>
    /// Index into the text where the next regex match will be attempted.
    /// </summary>
    private int matchPos;

    /// <summary>
    /// Index into the text where the current token starts.
    /// </summary>
    private int pos;

    /// <summary>
    /// Zero-based line number of the current token.
    /// </summary>
    private int line;

    /// <summary>
    /// Zero-based column number of the current token.
    /// </summary>
    private int column;

    /// <summary>
    /// Zero-based line number of the previous token.
    /// </summary>
    private int previousLine;

    /// <summary>
    /// Zero-based column number of the previous token.
    /// </summary>
    private int previousColumn;
				71
Jon Skeet	0ca3fec	2009-01-27 14:56:10 +0000	[diff] [blame]	72	// Note: atomic groups used to mimic possessive quantifiers in Java in both of these regexes
				73	private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(?>(\\s\|(#.*$))+)",
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	74	RegexOptions.Compiled \| RegexOptions.Multiline);
				75	private static readonly Regex TokenPattern = new Regex(
Jon Skeet	0ca3fec	2009-01-27 14:56:10 +0000	[diff] [blame]	76	"\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)\|" + // an identifier
				77	"\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)\|" + // a number
				78	"\\G\"(?>([^\"\\\n\\\\]\|\\\\.)*)(\"\|\\\\?$)\|" + // a double-quoted string
				79	"\\G\'(?>([^\"\\\n\\\\]\|\\\\.)*)(\'\|\\\\?$)", // a single-quoted string
Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	80	RegexOptions.Compiled \| RegexOptions.Multiline);
				81
				82	private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				83	private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				84	private static readonly Regex FloatNan = new Regex("^nanf?$", RegexOptions.Compiled \| RegexOptions.IgnoreCase);
				85
    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text and
    /// positions it on the first token.
    /// </summary>
    /// <param name="text">The text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException">If <paramref name="text"/> is null.</exception>
    public TextTokenizer(string text) {
      // Fail fast with the caller's parameter name instead of letting the first
      // regex match report a null "input" argument.
      if (text == null) {
        throw new ArgumentNullException("text");
      }
      this.text = text;
      // Skip any leading whitespace and comments, then load the first token.
      SkipWhitespace();
      NextToken();
    }
				92
    /// <summary>
    /// True once the input is exhausted, which is signalled by the
    /// current token being the empty string.
    /// </summary>
    public bool AtEnd {
      get {
        bool noMoreTokens = currentToken.Length == 0;
        return noMoreTokens;
      }
    }
				99
    /// <summary>
    /// Moves on to the next token, recording the position of the token being
    /// left behind and updating the line/column counters on the way.
    /// </summary>
    public void NextToken() {
      // Remember where the outgoing token was, for errors reported against it.
      previousLine = line;
      previousColumn = column;

      // Walk the characters between the old token start and the new one,
      // keeping the line and column counters in step.
      for (; pos < matchPos; pos++) {
        if (text[pos] != '\n') {
          column++;
          continue;
        }
        line++;
        column = 0;
      }

      // End of input is represented by an empty token.
      if (matchPos == text.Length) {
        currentToken = "";
        return;
      }

      // Use a recognised token if there is one here; otherwise fall back to
      // a single character.
      Match tokenMatch = TokenPattern.Match(text, matchPos);
      currentToken = tokenMatch.Success ? tokenMatch.Value : text[matchPos].ToString();
      matchPos += currentToken.Length;

      SkipWhitespace();
    }
				136
    /// <summary>
    /// Advances matchPos past any whitespace and comments so that it
    /// sits at the start of the next token.
    /// </summary>
    private void SkipWhitespace() {
      Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
      if (!skipped.Success) {
        return;
      }
      matchPos = matchPos + skipped.Length;
    }
				146
    /// <summary>
    /// Consumes the current token and returns true when it is exactly equal
    /// to the given token; otherwise leaves the tokenizer untouched and
    /// returns false.
    /// </summary>
    public bool TryConsume(string token) {
      if (currentToken != token) {
        return false;
      }
      NextToken();
      return true;
    }
				158
    /// <summary>
    /// Consumes the current token when it is exactly equal to the specified
    /// one; otherwise throws a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is expected next.</param>
    public void Consume(string token) {
      if (TryConsume(token)) {
        return;
      }
      throw CreateFormatException("Expected \"" + token + "\".");
    }
				173
    /// <summary>
    /// Reports whether the current token looks like an integer, judged by its
    /// first character (an ASCII digit or a sign). The token is not consumed.
    /// </summary>
    public bool LookingAtInteger() {
      if (currentToken.Length == 0) {
        return false;
      }

      char first = currentToken[0];
      if (first == '-' || first == '+') {
        return true;
      }
      return first >= '0' && first <= '9';
    }
				185
    /// <summary>
    /// If the next token is an identifier, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The text of the consumed identifier token.</returns>
    /// <exception cref="FormatException">
    /// If the input is exhausted, or the current token contains a character
    /// other than an ASCII letter, an ASCII digit, '_' or '.'.
    /// </exception>
    public string ConsumeIdentifier() {
      // An empty token means end of input. The character loop below would
      // accept it vacuously, so reject it explicitly.
      if (currentToken.Length == 0) {
        throw CreateFormatException("Expected identifier.");
      }

      foreach (char c in currentToken) {
        if (('a' <= c && c <= 'z') ||
            ('A' <= c && c <= 'Z') ||
            ('0' <= c && c <= '9') ||
            (c == '_') || (c == '.')) {
          // OK
        } else {
          throw CreateFormatException("Expected identifier.");
        }
      }

      string result = currentToken;
      NextToken();
      return result;
    }
				206
    /// <summary>
    /// Parses the current token as a 32-bit signed integer, consumes it and
    /// returns the value. Throws a FormatException if the token cannot be parsed.
    /// </summary>
    public int ConsumeInt32() {
      int parsed;
      try {
        parsed = TextFormat.ParseInt32(currentToken);
      } catch (FormatException e) {
        // Re-report the failure with the position of the offending token.
        throw CreateIntegerParseException(e);
      }
      NextToken();
      return parsed;
    }
				220
    /// <summary>
    /// Parses the current token as a 32-bit unsigned integer, consumes it and
    /// returns the value. Throws a FormatException if the token cannot be parsed.
    /// </summary>
    public uint ConsumeUInt32() {
      uint parsed;
      try {
        parsed = TextFormat.ParseUInt32(currentToken);
      } catch (FormatException e) {
        // Re-report the failure with the position of the offending token.
        throw CreateIntegerParseException(e);
      }
      NextToken();
      return parsed;
    }
				234
    /// <summary>
    /// Parses the current token as a 64-bit signed integer, consumes it and
    /// returns the value. Throws a FormatException if the token cannot be parsed.
    /// </summary>
    public long ConsumeInt64() {
      long parsed;
      try {
        parsed = TextFormat.ParseInt64(currentToken);
      } catch (FormatException e) {
        // Re-report the failure with the position of the offending token.
        throw CreateIntegerParseException(e);
      }
      NextToken();
      return parsed;
    }
				248
    /// <summary>
    /// Parses the current token as a 64-bit unsigned integer, consumes it and
    /// returns the value. Throws a FormatException if the token cannot be parsed.
    /// </summary>
    public ulong ConsumeUInt64() {
      ulong parsed;
      try {
        parsed = TextFormat.ParseUInt64(currentToken);
      } catch (FormatException e) {
        // Re-report the failure with the position of the offending token.
        throw CreateIntegerParseException(e);
      }
      NextToken();
      return parsed;
    }
				262
    /// <summary>
    /// If the next token is a double, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>
    /// The parsed value; positive or negative infinity for "inf"/"infinity"
    /// (optionally preceded by '-'), or NaN for "nan", all matched
    /// case-insensitively.
    /// </returns>
    /// <exception cref="FormatException">
    /// If the token is not a valid number or is out of range for a double.
    /// </exception>
    public double ConsumeDouble() {
      // We need to parse infinity and nan separately because
      // double.Parse() does not accept "inf", "infinity", or "nan".
      if (DoubleInfinity.IsMatch(currentToken)) {
        // Tokens are machine-readable text, so compare ordinally rather than
        // with the culture-sensitive StartsWith(string) overload.
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? double.NegativeInfinity : double.PositiveInfinity;
      }
      if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
        NextToken();
        return Double.NaN;
      }

      try {
        double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }
				290
    /// <summary>
    /// If the next token is a float, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>
    /// The parsed value; positive or negative infinity for "inf"/"infinity"
    /// (optionally preceded by '-'), or NaN for "nan", each optionally
    /// followed by 'f' and matched case-insensitively.
    /// </returns>
    /// <exception cref="FormatException">
    /// If the token is not a valid number or is out of range for a float.
    /// The current token is left unchanged in that case.
    /// </exception>
    public float ConsumeFloat() {
      // We need to parse infinity and nan separately because
      // float.Parse() does not accept "inf", "infinity", or "nan".
      if (FloatInfinity.IsMatch(currentToken)) {
        // Tokens are machine-readable text, so compare ordinally.
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? float.NegativeInfinity : float.PositiveInfinity;
      }
      if (FloatNan.IsMatch(currentToken)) {
        NextToken();
        return float.NaN;
      }

      // Strip at most one trailing 'f' suffix, working on a local copy so that
      // a failed parse does not alter currentToken (and so cannot make AtEnd
      // report true by emptying the token).
      string number = currentToken;
      if (number.EndsWith("f", StringComparison.Ordinal)) {
        number = number.Substring(0, number.Length - 1);
      }

      try {
        float result = float.Parse(number, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }
				323
    /// <summary>
    /// Consumes the current token when it is exactly "true" or "false" and
    /// returns the corresponding value; otherwise throws a FormatException.
    /// </summary>
    public bool ConsumeBoolean() {
      bool value;
      if (currentToken == "true") {
        value = true;
      } else if (currentToken == "false") {
        value = false;
      } else {
        throw CreateFormatException("Expected \"true\" or \"false\".");
      }
      NextToken();
      return value;
    }
				339
    /// <summary>
    /// Consumes the current token as a quoted string and returns its unescaped
    /// bytes decoded as UTF-8. Throws a FormatException if the token is not a string.
    /// </summary>
    public string ConsumeString() {
      ByteString bytes = ConsumeByteString();
      return bytes.ToStringUtf8();
    }
				347
    /// <summary>
    /// Consumes the current token as a quoted string, unescapes its contents
    /// and returns them as a ByteString. Throws a FormatException if the token
    /// is not a string, has no closing quote, or contains an invalid escape.
    /// </summary>
    public ByteString ConsumeByteString() {
      // The token must open with a single or double quote.
      bool opensWithQuote = currentToken.Length > 0 &&
          (currentToken[0] == '\"' || currentToken[0] == '\'');
      if (!opensWithQuote) {
        throw CreateFormatException("Expected string.");
      }

      // It must also close with the same quote character it opened with.
      char quote = currentToken[0];
      int lastIndex = currentToken.Length - 1;
      if (lastIndex < 1 || currentToken[lastIndex] != quote) {
        throw CreateFormatException("String missing ending quote.");
      }

      ByteString unescaped;
      try {
        // Drop the surrounding quotes before unescaping.
        string escaped = currentToken.Substring(1, lastIndex - 1);
        unescaped = TextFormat.UnescapeBytes(escaped);
      } catch (FormatException e) {
        // Re-report the failure with the position of the offending token.
        throw CreateFormatException(e.Message);
      }
      NextToken();
      return unescaped;
    }
				372
    /// <summary>
    /// Builds (but does not throw) a FormatException whose message is prefixed
    /// with the line and column of the current token.
    /// </summary>
    public FormatException CreateFormatException(string description) {
      // Counters are zero-based internally; people generally prefer one-based positions.
      int displayLine = line + 1;
      int displayColumn = column + 1;
      return new FormatException(displayLine + ":" + displayColumn + ": " + description);
    }
				381
    /// <summary>
    /// Builds (but does not throw) a FormatException whose message is prefixed
    /// with the line and column of the previous token.
    /// </summary>
    public FormatException CreateFormatExceptionPreviousToken(string description) {
      // Counters are zero-based internally; people generally prefer one-based positions.
      int displayLine = previousLine + 1;
      int displayColumn = previousColumn + 1;
      return new FormatException(displayLine + ":" + displayColumn + ": " + description);
    }
				390
    /// <summary>
    /// Wraps the message of an integer parsing failure in a FormatException
    /// that carries the position of the current token.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e) {
      string description = "Couldn't parse integer: " + e.Message;
      return CreateFormatException(description);
    }
				398
    /// <summary>
    /// Wraps the message of a float or double parsing failure in a
    /// FormatException that carries the position of the current token.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e) {
      string description = "Couldn't parse number: " + e.Message;
      return CreateFormatException(description);
    }
				406	}
				407	}