Blame - src/ProtocolBuffers/TextTokenizer.cs - platform/external/protobuf-javalite

blob: d53ae596a578e29259ac7be1b1672cd35af896fb [file] [log] [blame]

Jon Skeet	6803686	2008-10-22 13:30:34 +0100	[diff] [blame]	1	// Protocol Buffers - Google's data interchange format
				2	// Copyright 2008 Google Inc.
				3	// http://code.google.com/p/protobuf/
				4	//
				5	// Licensed under the Apache License, Version 2.0 (the "License");
				6	// you may not use this file except in compliance with the License.
				7	// You may obtain a copy of the License at
				8	//
				9	// http://www.apache.org/licenses/LICENSE-2.0
				10	//
				11	// Unless required by applicable law or agreed to in writing, software
				12	// distributed under the License is distributed on an "AS IS" BASIS,
				13	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	// See the License for the specific language governing permissions and
				15	// limitations under the License.
				16	using System;
				17	using System.Globalization;
				18	using System.Text.RegularExpressions;
				19
				20	namespace Google.ProtocolBuffers {
				21	/// <summary>
				22	/// Represents a stream of tokens parsed from a string.
				23	/// </summary>
				24	internal sealed class TextTokenizer {
				/// <summary>
				/// The complete text being tokenized.
				/// </summary>
				private readonly string text;

				/// <summary>
				/// The token most recently read by NextToken; the empty string once
				/// the end of the text has been reached.
				/// </summary>
				private string currentToken;

				/// <summary>
				/// Index into the text at which the next regular expression match starts.
				/// </summary>
				private int matchPos;

				/// <summary>
				/// Index into the text at which the current token starts.
				/// </summary>
				private int pos;

				/// <summary>
				/// Zero-based line number of the current token.
				/// </summary>
				private int line;

				/// <summary>
				/// Zero-based column number of the current token.
				/// </summary>
				private int column;

				/// <summary>
				/// Zero-based line number of the token before the current one.
				/// </summary>
				private int previousLine;

				/// <summary>
				/// Zero-based column number of the token before the current one.
				/// </summary>
				private int previousColumn;
				55
				/// <summary>
				/// Matches one or more whitespace characters and/or '#' comments, starting
				/// exactly at the position the match is attempted from (\G). Multiline mode
				/// lets '$' match at the end of every line, so a comment stops at its own
				/// line end rather than at the end of the whole text.
				/// </summary>
				private static readonly Regex WhitespaceAndCommentPattern = new Regex(
					"\\G(\\s|(#.*$))+",
					RegexOptions.Compiled | RegexOptions.Multiline);
				/// <summary>
				/// Matches a single token starting exactly at the position the match is
				/// attempted from (\G): an identifier, a number, or a double- or
				/// single-quoted string. A string may end at the end of the line without its
				/// closing quote; ConsumeByteString reports that case as an error.
				/// </summary>
				private static readonly Regex TokenPattern = new Regex(
					"\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +              // an identifier
					"\\G[0-9+-][0-9a-zA-Z_.+-]*|" +               // a number
					"\\G\"([^\"\\\n\\\\]|\\\\.)*(\"|\\\\?$)|" +   // a double-quoted string
					// The character class must exclude the single quote (not the double
					// quote); otherwise the body swallows the closing quote and runs on
					// to the last single quote on the line.
					"\\G\'([^\'\\\n\\\\]|\\\\.)*(\'|\\\\?$)",     // a single-quoted string
					RegexOptions.Compiled | RegexOptions.Multiline);
				64
				// These patterns are matched against whole tokens of a machine-readable
				// format, so case-insensitivity must not depend on the current culture
				// (under tr-TR, for example, 'I' does not case-fold to 'i').

				/// <summary>
				/// Matches "inf" or "infinity" with an optional leading minus sign, ignoring case.
				/// </summary>
				private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$",
					RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

				/// <summary>
				/// As DoubleInfinity, but additionally allows a trailing "f" suffix.
				/// </summary>
				private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$",
					RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

				/// <summary>
				/// Matches "nan" with an optional trailing "f" suffix, ignoring case.
				/// </summary>
				private static readonly Regex FloatNan = new Regex("^nanf?$",
					RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
				68
				/// <summary>
				/// Constructs a tokenizer over the given text, skips any leading whitespace
				/// and comments, and reads the first token.
				/// </summary>
				/// <param name="text">The text to tokenize; must not be null.</param>
				/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
				public TextTokenizer(string text) {
					// Fail here, naming the right parameter, rather than from inside the
					// first regular expression match.
					if (text == null) {
						throw new ArgumentNullException("text");
					}
					this.text = text;
					SkipWhitespace();
					NextToken();
				}
				75
				/// <summary>
				/// Gets whether the end of the input has been reached, which is signalled
				/// by the current token being empty.
				/// </summary>
				public bool AtEnd {
					get {
						int tokenLength = currentToken.Length;
						return tokenLength == 0;
					}
				}
				82
				/// <summary>
				/// Moves on to the next token. The position of the token being left is
				/// remembered as the previous line and column, the line and column counters
				/// are brought up to the start of the new token, and the new token is read:
				/// the empty string at the end of the text, otherwise a TokenPattern match,
				/// otherwise a single character. Whitespace and comments following the new
				/// token are then skipped.
				/// </summary>
				public void NextToken() {
					previousLine = line;
					previousColumn = column;

					// Walk the characters between the old token start and the new one,
					// keeping the line and column counters in step.
					for (; pos < matchPos; pos++) {
						if (text[pos] != '\n') {
							column++;
							continue;
						}
						line++;
						column = 0;
					}

					// Nothing left to read: an empty token marks the end of the input.
					if (matchPos == text.Length) {
						currentToken = "";
						return;
					}

					Match tokenMatch = TokenPattern.Match(text, matchPos);
					if (tokenMatch.Success) {
						currentToken = tokenMatch.Value;
						matchPos += tokenMatch.Length;
					} else {
						// Not a recognised token shape; treat one character as the token.
						currentToken = text[matchPos].ToString();
						matchPos++;
					}

					SkipWhitespace();
				}
				119
				/// <summary>
				/// Moves matchPos past any whitespace and comments found at that position,
				/// leaving it at the start of the next token.
				/// </summary>
				private void SkipWhitespace() {
					Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
					// A failed match has zero length, so adding it leaves matchPos unchanged.
					matchPos += skipped.Length;
				}
				129
				/// <summary>
				/// Consumes the current token if it is exactly equal to the given one.
				/// </summary>
				/// <param name="token">The token text to compare against.</param>
				/// <returns>
				/// true if the token matched and was consumed; false, with the tokenizer
				/// left untouched, otherwise.
				/// </returns>
				public bool TryConsume(string token) {
					if (currentToken != token) {
						return false;
					}
					NextToken();
					return true;
				}
				141
				/// <summary>
				/// Consumes the current token, which must be exactly equal to the given one.
				/// </summary>
				/// <param name="token">The token text that is required next.</param>
				/// <exception cref="FormatException">The current token is different.</exception>
				public void Consume(string token) {
					if (TryConsume(token)) {
						return;
					}
					throw CreateFormatException("Expected \"" + token + "\".");
				}
				156
				/// <summary>
				/// Reports whether the current token looks like an integer, judged only by
				/// its first character (an ASCII digit, '-' or '+'). The token is not consumed.
				/// </summary>
				/// <returns>
				/// true if the current token starts like an integer; false otherwise,
				/// including at the end of the input.
				/// </returns>
				public bool LookingAtInteger() {
					if (currentToken.Length == 0) {
						return false;
					}

					char c = currentToken[0];
					return ('0' <= c && c <= '9') || c == '-' || c == '+';
				}
				168
				/// <summary>
				/// Consumes the current token as an identifier and returns its text. Every
				/// character must be an ASCII letter, an ASCII digit, '_' or '.'.
				/// </summary>
				/// <returns>The text of the identifier.</returns>
				/// <exception cref="FormatException">
				/// The current token is empty (end of input) or contains any other character.
				/// </exception>
				public string ConsumeIdentifier() {
					// An empty token would otherwise pass the character check below and be
					// handed back as a zero-length "identifier".
					if (currentToken.Length == 0) {
						throw CreateFormatException("Expected identifier.");
					}

					foreach (char c in currentToken) {
						bool isLetter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
						bool isDigit = '0' <= c && c <= '9';
						if (!isLetter && !isDigit && c != '_' && c != '.') {
							throw CreateFormatException("Expected identifier.");
						}
					}

					string result = currentToken;
					NextToken();
					return result;
				}
				189
				/// <summary>
				/// Consumes the current token as a 32-bit signed integer, as parsed by
				/// TextFormat.ParseInt32, and returns its value.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public int ConsumeInt32() {
					int parsed;
					try {
						parsed = TextFormat.ParseInt32(currentToken);
					} catch (FormatException e) {
						throw CreateIntegerParseException(e);
					}
					NextToken();
					return parsed;
				}
				203
				/// <summary>
				/// Consumes the current token as a 32-bit unsigned integer, as parsed by
				/// TextFormat.ParseUInt32, and returns its value.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public uint ConsumeUInt32() {
					uint parsed;
					try {
						parsed = TextFormat.ParseUInt32(currentToken);
					} catch (FormatException e) {
						throw CreateIntegerParseException(e);
					}
					NextToken();
					return parsed;
				}
				217
				/// <summary>
				/// Consumes the current token as a 64-bit signed integer, as parsed by
				/// TextFormat.ParseInt64, and returns its value.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public long ConsumeInt64() {
					long parsed;
					try {
						parsed = TextFormat.ParseInt64(currentToken);
					} catch (FormatException e) {
						throw CreateIntegerParseException(e);
					}
					NextToken();
					return parsed;
				}
				231
				/// <summary>
				/// Consumes the current token as a 64-bit unsigned integer, as parsed by
				/// TextFormat.ParseUInt64, and returns its value.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public ulong ConsumeUInt64() {
					ulong parsed;
					try {
						parsed = TextFormat.ParseUInt64(currentToken);
					} catch (FormatException e) {
						throw CreateIntegerParseException(e);
					}
					NextToken();
					return parsed;
				}
				245
				/// <summary>
				/// Consumes the current token as a double and returns its value. The
				/// spellings matched by DoubleInfinity and "nan" (in any letter case) are
				/// recognised here; every other token is parsed with the invariant culture.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public double ConsumeDouble() {
					// Infinity and NaN are handled explicitly so that the spellings used by
					// the text format ("inf", "infinity", "nan") are accepted.
					if (DoubleInfinity.IsMatch(currentToken)) {
						// Tokens are machine-readable text: compare ordinally, not by culture.
						bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
						NextToken();
						return negative ? double.NegativeInfinity : double.PositiveInfinity;
					}
					if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
						NextToken();
						return Double.NaN;
					}

					try {
						double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
						NextToken();
						return result;
					} catch (FormatException e) {
						throw CreateFloatParseException(e);
					} catch (OverflowException e) {
						throw CreateFloatParseException(e);
					}
				}
				273
				/// <summary>
				/// Consumes the current token as a float and returns its value. The
				/// spellings matched by FloatInfinity and FloatNan are recognised here;
				/// every other token has at most one trailing "f" removed and is then
				/// parsed with the invariant culture.
				/// </summary>
				/// <exception cref="FormatException">The token could not be parsed.</exception>
				public float ConsumeFloat() {
					// Infinity and NaN are handled explicitly so that the spellings used by
					// the text format ("inf", "infinity", "nan", each optionally followed by
					// "f") are accepted.
					if (FloatInfinity.IsMatch(currentToken)) {
						// Tokens are machine-readable text: compare ordinally, not by culture.
						bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
						NextToken();
						return negative ? float.NegativeInfinity : float.PositiveInfinity;
					}
					if (FloatNan.IsMatch(currentToken)) {
						NextToken();
						return float.NaN;
					}

					// Work on a copy so that a failed parse leaves the current token exactly
					// as it was read, and strip a single suffix only: "1.5ff" is not a float.
					string digits = currentToken;
					if (digits.EndsWith("f", StringComparison.Ordinal)) {
						digits = digits.Substring(0, digits.Length - 1);
					}

					try {
						float result = float.Parse(digits, CultureInfo.InvariantCulture);
						NextToken();
						return result;
					} catch (FormatException e) {
						throw CreateFloatParseException(e);
					} catch (OverflowException e) {
						throw CreateFloatParseException(e);
					}
				}
				306
				/// <summary>
				/// Consumes the current token as a Boolean, which must be spelled exactly
				/// "true" or "false", and returns its value.
				/// </summary>
				/// <exception cref="FormatException">The token is neither spelling.</exception>
				public bool ConsumeBoolean() {
					bool value;
					switch (currentToken) {
						case "true":
							value = true;
							break;
						case "false":
							value = false;
							break;
						default:
							throw CreateFormatException("Expected \"true\" or \"false\".");
					}
					NextToken();
					return value;
				}
				322
				/// <summary>
				/// Consumes the current token as a quoted string and returns its unescaped
				/// bytes converted with ByteString.ToStringUtf8.
				/// </summary>
				/// <exception cref="FormatException">The token is not a valid string.</exception>
				public string ConsumeString() {
					ByteString bytes = ConsumeByteString();
					return bytes.ToStringUtf8();
				}
				330
				/// <summary>
				/// Consumes the current token as a quoted string and returns its contents,
				/// unescaped by TextFormat.UnescapeBytes, as a ByteString.
				/// </summary>
				/// <exception cref="FormatException">
				/// The token does not start with a quote character, does not end with the
				/// same quote character, or its contents could not be unescaped.
				/// </exception>
				public ByteString ConsumeByteString() {
					char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
					if (quote != '\"' && quote != '\'') {
						throw CreateFormatException("Expected string.");
					}

					// A token consisting of the opening quote alone has no closing quote either.
					if (currentToken.Length < 2 ||
							currentToken[currentToken.Length - 1] != quote) {
						throw CreateFormatException("String missing ending quote.");
					}

					try {
						string escaped = currentToken.Substring(1, currentToken.Length - 2);
						ByteString result = TextFormat.UnescapeBytes(escaped);
						NextToken();
						return result;
					} catch (FormatException e) {
						// Re-create the exception so the message carries the line and column.
						throw CreateFormatException(e.Message);
					}
				}
				355
				/// <summary>
				/// Builds, without throwing, a FormatException whose message is the given
				/// description prefixed with the line and column of the current token.
				/// </summary>
				/// <param name="description">The text describing the problem.</param>
				/// <returns>The exception, ready for the caller to throw.</returns>
				public FormatException CreateFormatException(string description) {
					// The counters are zero-based; reported positions are one-based.
					int reportedLine = line + 1;
					int reportedColumn = column + 1;
					return new FormatException(
						string.Format("{0}:{1}: {2}", reportedLine, reportedColumn, description));
				}
				364
				/// <summary>
				/// Builds, without throwing, a FormatException whose message is the given
				/// description prefixed with the line and column of the previous token.
				/// </summary>
				/// <param name="description">The text describing the problem.</param>
				/// <returns>The exception, ready for the caller to throw.</returns>
				public FormatException CreateFormatExceptionPreviousToken(string description) {
					// The counters are zero-based; reported positions are one-based.
					int reportedLine = previousLine + 1;
					int reportedColumn = previousColumn + 1;
					return new FormatException(
						string.Format("{0}:{1}: {2}", reportedLine, reportedColumn, description));
				}
				373
				/// <summary>
				/// Wraps the message of an integer parsing failure in a FormatException
				/// that carries the position of the current token.
				/// </summary>
				/// <param name="e">The exception raised while parsing the integer.</param>
				private FormatException CreateIntegerParseException(FormatException e) {
					string detail = "Couldn't parse integer: " + e.Message;
					return CreateFormatException(detail);
				}
				381
				/// <summary>
				/// Wraps the message of a float or double parsing failure in a
				/// FormatException that carries the position of the current token.
				/// </summary>
				/// <param name="e">The exception raised while parsing the number.</param>
				private FormatException CreateFloatParseException(Exception e) {
					string detail = "Couldn't parse number: " + e.Message;
					return CreateFormatException(detail);
				}
				389	}
				390	}