blob: ecd031abcc85ee8e91b2d3c5b8e6b5bc89a73906 [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://github.com/jskeet/dotnet-protobufs/
// Original C++/Java/Python code:
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Jon Skeet68036862008-10-22 13:30:34 +010032using System;
33using System.Globalization;
34using System.Text.RegularExpressions;
35
namespace Google.ProtocolBuffers {
  /// <summary>
  /// Represents a stream of tokens parsed from a string.
  /// </summary>
  /// <remarks>
  /// The tokenizer always holds one token of lookahead (the "current" token).
  /// The <c>Consume*</c> methods inspect the current token and, on success,
  /// advance to the next one. An empty current token means the end of the
  /// input has been reached. Whitespace and '#' comments between tokens are
  /// skipped automatically.
  /// </remarks>
  internal sealed class TextTokenizer {
    /// <summary>
    /// The complete input being tokenized.
    /// </summary>
    private readonly string text;

    /// <summary>
    /// The current (lookahead) token; the empty string once the end of the
    /// input has been reached.
    /// </summary>
    private string currentToken;

    /// <summary>
    /// The character index within the text to perform the next regex match at.
    /// </summary>
    private int matchPos = 0;

    /// <summary>
    /// The character index within the text at which the current token begins.
    /// </summary>
    private int pos = 0;

    /// <summary>
    /// The (zero-based) line number of the current token.
    /// </summary>
    private int line = 0;

    /// <summary>
    /// The (zero-based) column number of the current token.
    /// </summary>
    private int column = 0;

    /// <summary>
    /// The (zero-based) line number of the previous token.
    /// </summary>
    private int previousLine = 0;

    /// <summary>
    /// The (zero-based) column number of the previous token.
    /// </summary>
    private int previousColumn = 0;

    // All patterns below start with \G so that a match is only accepted when it
    // begins exactly at the start index passed to Regex.Match(text, matchPos).
    // Multiline makes '$' match at the end of every line, not just at the end
    // of the input.

    /// <summary>
    /// Matches a run of whitespace and/or '#' comments (a comment extends to
    /// the end of its line).
    /// </summary>
    private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s|(#.*$))+",
        RegexOptions.Compiled | RegexOptions.Multiline);

    /// <summary>
    /// Matches a single identifier, number or quoted string token.
    /// </summary>
    /// <remarks>
    /// A string token ends either at its closing quote or at the end of the
    /// line (optionally after a dangling backslash), so an unterminated string
    /// still yields a token; ConsumeByteString reports it as an error.
    /// Inside a string, the delimiting quote, a newline and a backslash may
    /// only appear as part of a backslash escape. The single-quoted
    /// alternative must therefore exclude the single quote (not the double
    /// quote) from its plain-character class; otherwise two single-quoted
    /// strings on one line would be merged into one token.
    /// </remarks>
    private static readonly Regex TokenPattern = new Regex(
        "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +            // an identifier
        "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +             // a number
        "\\G\"([^\"\\\n\\\\]|\\\\.)*(\"|\\\\?$)|" + // a double-quoted string
        "\\G\'([^\'\\\n\\\\]|\\\\.)*(\'|\\\\?$)",   // a single-quoted string
        RegexOptions.Compiled | RegexOptions.Multiline);

    // double.Parse/float.Parse are not relied upon for these spellings, so
    // infinity and NaN tokens are recognised explicitly (case-insensitively).
    private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    private static readonly Regex FloatNan = new Regex("^nanf?$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text and
    /// positions it on the first token.
    /// </summary>
    /// <param name="text">The text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public TextTokenizer(string text) {
      if (text == null) {
        throw new ArgumentNullException("text");
      }
      this.text = text;
      SkipWhitespace();
      NextToken();
    }

    /// <summary>
    /// Are we at the end of the input?
    /// </summary>
    public bool AtEnd {
      get { return currentToken.Length == 0; }
    }

    /// <summary>
    /// Advances to the next token.
    /// </summary>
    /// <remarks>
    /// Remembers the position of the token being left (for
    /// CreateFormatExceptionPreviousToken), updates the line/column counters
    /// to the start of the new token, reads the new token and skips any
    /// whitespace or comments following it. At the end of the input the
    /// current token becomes the empty string.
    /// </remarks>
    public void NextToken() {
      previousLine = line;
      previousColumn = column;

      // Advance the line counter to the current position.
      while (pos < matchPos) {
        if (text[pos] == '\n') {
          ++line;
          column = 0;
        } else {
          ++column;
        }
        ++pos;
      }

      // Match the next token.
      if (matchPos == text.Length) {
        // EOF
        currentToken = "";
      } else {
        Match match = TokenPattern.Match(text, matchPos);
        if (match.Success) {
          currentToken = match.Value;
          matchPos += match.Length;
        } else {
          // Not an identifier, number or string: take one character
          // (punctuation such as ':' or '{') as the token.
          currentToken = text[matchPos].ToString();
          matchPos++;
        }

        SkipWhitespace();
      }
    }

    /// <summary>
    /// Skip over any whitespace so that matchPos starts at the next token.
    /// </summary>
    private void SkipWhitespace() {
      Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
      if (match.Success) {
        matchPos += match.Length;
      }
    }

    /// <summary>
    /// If the next token exactly matches the given token, consume it and return
    /// true. Otherwise, return false without doing anything.
    /// </summary>
    /// <param name="token">The exact token text to look for.</param>
    public bool TryConsume(string token) {
      if (currentToken == token) {
        NextToken();
        return true;
      }
      return false;
    }

    /// <summary>
    /// If the next token exactly matches the specified one, consume it.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is required next.</param>
    /// <exception cref="FormatException">The current token is not <paramref name="token"/>.</exception>
    public void Consume(string token) {
      if (!TryConsume(token)) {
        throw CreateFormatException("Expected \"" + token + "\".");
      }
    }

    /// <summary>
    /// Returns true if the next token is an integer, but does not consume it.
    /// </summary>
    /// <remarks>
    /// Only the first character is examined: a digit, '-' or '+' counts as the
    /// start of an integer.
    /// </remarks>
    public bool LookingAtInteger() {
      if (currentToken.Length == 0) {
        return false;
      }

      char c = currentToken[0];
      return ('0' <= c && c <= '9') || c == '-' || c == '+';
    }

    /// <summary>
    /// If the next token is an identifier, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// Every character of the token must be an ASCII letter, an ASCII digit,
    /// '_' or '.'. The empty token (end of input) is not an identifier.
    /// </remarks>
    /// <exception cref="FormatException">The current token is not an identifier.</exception>
    public string ConsumeIdentifier() {
      // At the end of the input the token is empty; without this check the
      // loop below would accept it and hand an empty "identifier" to the caller.
      if (currentToken.Length == 0) {
        throw CreateFormatException("Expected identifier.");
      }

      foreach (char c in currentToken) {
        if (('a' <= c && c <= 'z') ||
            ('A' <= c && c <= 'Z') ||
            ('0' <= c && c <= '9') ||
            (c == '_') || (c == '.')) {
          // OK
        } else {
          throw CreateFormatException("Expected identifier.");
        }
      }

      string result = currentToken;
      NextToken();
      return result;
    }

    /// <summary>
    /// If the next token is a 32-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// TextFormat.ParseInt32 rejected the token; the message carries the
    /// position of the token.
    /// </exception>
    public int ConsumeInt32() {
      try {
        int result = TextFormat.ParseInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 32-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// TextFormat.ParseUInt32 rejected the token; the message carries the
    /// position of the token.
    /// </exception>
    public uint ConsumeUInt32() {
      try {
        uint result = TextFormat.ParseUInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// TextFormat.ParseInt64 rejected the token; the message carries the
    /// position of the token.
    /// </exception>
    public long ConsumeInt64() {
      try {
        long result = TextFormat.ParseInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// TextFormat.ParseUInt64 rejected the token; the message carries the
    /// position of the token.
    /// </exception>
    public ulong ConsumeUInt64() {
      try {
        ulong result = TextFormat.ParseUInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a double, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// Accepts "inf"/"infinity" (optionally preceded by '-') and "nan" in any
    /// letter case; everything else is parsed with the invariant culture.
    /// </remarks>
    /// <exception cref="FormatException">The current token is not a valid double.</exception>
    public double ConsumeDouble() {
      // Infinity and NaN are handled separately so that the "inf",
      // "infinity" and "nan" spellings are accepted.
      if (DoubleInfinity.IsMatch(currentToken)) {
        // The regex matched, so the token is non-empty; an ordinal check of
        // the first character avoids culture-sensitive prefix matching.
        bool negative = currentToken[0] == '-';
        NextToken();
        return negative ? double.NegativeInfinity : double.PositiveInfinity;
      }
      if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
        NextToken();
        return Double.NaN;
      }

      try {
        double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a float, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// Accepts "inf"/"infinity" (optionally preceded by '-') and "nan" in any
    /// letter case, each optionally followed by 'f'. A numeric token may carry
    /// a single trailing 'f' suffix, which is ignored. The current token is
    /// left untouched when parsing fails.
    /// </remarks>
    /// <exception cref="FormatException">The current token is not a valid float.</exception>
    public float ConsumeFloat() {
      // Infinity and NaN are handled separately so that the "inf",
      // "infinity" and "nan" spellings are accepted.
      if (FloatInfinity.IsMatch(currentToken)) {
        // The regex matched, so the token is non-empty; an ordinal check of
        // the first character avoids culture-sensitive prefix matching.
        bool negative = currentToken[0] == '-';
        NextToken();
        return negative ? float.NegativeInfinity : float.PositiveInfinity;
      }
      if (FloatNan.IsMatch(currentToken)) {
        NextToken();
        return float.NaN;
      }

      // Strip one optional 'f' suffix from a local copy. The tokenizer state
      // must not be modified before parsing succeeds: overwriting currentToken
      // here would, for a token such as "f", leave it empty after a failed
      // parse and make AtEnd report the end of the input.
      string number = currentToken;
      if (number.Length > 0 && number[number.Length - 1] == 'f') {
        number = number.Substring(0, number.Length - 1);
      }

      try {
        float result = float.Parse(number, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a Boolean, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The current token is neither "true" nor "false".</exception>
    public bool ConsumeBoolean() {
      if (currentToken == "true") {
        NextToken();
        return true;
      }
      if (currentToken == "false") {
        NextToken();
        return false;
      }
      throw CreateFormatException("Expected \"true\" or \"false\".");
    }

    /// <summary>
    /// If the next token is a string, consume it and return its (unescaped) value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// The unescaped bytes obtained from ConsumeByteString are decoded via
    /// ByteString.ToStringUtf8.
    /// </remarks>
    public string ConsumeString() {
      return ConsumeByteString().ToStringUtf8();
    }

    /// <summary>
    /// If the next token is a string, consume it, unescape it as a
    /// ByteString and return it. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// The current token does not start with a quote, lacks the matching
    /// closing quote, or TextFormat.UnescapeBytes rejected its contents.
    /// </exception>
    public ByteString ConsumeByteString() {
      char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
      if (quote != '\"' && quote != '\'') {
        throw CreateFormatException("Expected string.");
      }

      // The token pattern also yields unterminated strings (ending at the end
      // of the line), so the closing quote has to be verified here.
      if (currentToken.Length < 2 ||
          currentToken[currentToken.Length - 1] != quote) {
        throw CreateFormatException("String missing ending quote.");
      }

      try {
        // Drop the surrounding quotes before unescaping.
        string escaped = currentToken.Substring(1, currentToken.Length - 2);
        ByteString result = TextFormat.UnescapeBytes(escaped);
        NextToken();
        return result;
      } catch (FormatException e) {
        // Re-throw with the position of the offending token prepended.
        throw CreateFormatException(e.Message);
      }
    }

    /// <summary>
    /// Returns a format exception with the current line and column numbers
    /// in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the "line:column: " prefix.</param>
    public FormatException CreateFormatException(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
    }

    /// <summary>
    /// Returns a format exception with the line and column numbers of the
    /// previous token in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the "line:column: " prefix.</param>
    public FormatException CreateFormatExceptionPreviousToken(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse an integer.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e) {
      return CreateFormatException("Couldn't parse integer: " + e.Message);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse a float or double.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e) {
      return CreateFormatException("Couldn't parse number: " + e.Message);
    }
  }
}