blob: 0787c03e48857b47c65fedff51073f8a93141d46 [file] [log] [blame]
Jon Skeet60c059b2008-10-23 21:17:56 +01001// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// http://github.com/jskeet/dotnet-protobufs/
4// Original C++/Java/Python code:
Jon Skeet68036862008-10-22 13:30:34 +01005// http://code.google.com/p/protobuf/
6//
Jon Skeet60c059b2008-10-23 21:17:56 +01007// Redistribution and use in source and binary forms, with or without
8// modification, are permitted provided that the following conditions are
9// met:
Jon Skeet68036862008-10-22 13:30:34 +010010//
Jon Skeet60c059b2008-10-23 21:17:56 +010011// * Redistributions of source code must retain the above copyright
12// notice, this list of conditions and the following disclaimer.
13// * Redistributions in binary form must reproduce the above
14// copyright notice, this list of conditions and the following disclaimer
15// in the documentation and/or other materials provided with the
16// distribution.
17// * Neither the name of Google Inc. nor the names of its
18// contributors may be used to endorse or promote products derived from
19// this software without specific prior written permission.
Jon Skeet68036862008-10-22 13:30:34 +010020//
Jon Skeet60c059b2008-10-23 21:17:56 +010021// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Jon Skeet68036862008-10-22 13:30:34 +010032using System;
33using System.Globalization;
34using System.Text.RegularExpressions;
35
36namespace Google.ProtocolBuffers {
37 /// <summary>
38 /// Represents a stream of tokens parsed from a string.
39 /// </summary>
40 internal sealed class TextTokenizer {
/// <summary>
/// The complete input being tokenized.
/// </summary>
private readonly string text;

/// <summary>
/// The token most recently read by NextToken; the empty string once the
/// end of the input has been reached.
/// </summary>
private string currentToken;

/// <summary>
/// Index into the text at which the next regex match will be attempted.
/// </summary>
private int matchPos;

/// <summary>
/// Index into the text up to which line and column have been counted;
/// NextToken advances it to the start of the token it is about to read.
/// </summary>
private int pos;

/// <summary>
/// Zero-based line number of the current token.
/// </summary>
private int line;

/// <summary>
/// Zero-based column number of the current token.
/// </summary>
private int column;

/// <summary>
/// Zero-based line number of the token before the current one.
/// </summary>
private int previousLine;

/// <summary>
/// Zero-based column number of the token before the current one.
/// </summary>
private int previousColumn;
71
// Note: atomic groups used to mimic possessive quantifiers in Java in both of these regexes.
// Every alternative starts with \G so that a match can only begin exactly at the
// start index handed to Regex.Match, never further along in the text.

/// <summary>
/// Matches a run of whitespace and/or '#' comments (each comment runs to the end of its line).
/// </summary>
private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(?>(\\s|(#.*$))+)",
    RegexOptions.Compiled | RegexOptions.Multiline);

/// <summary>
/// Matches one identifier, number or quoted string. A quoted string ends at its
/// closing quote, or at the end of the line if the closing quote is missing.
/// </summary>
private static readonly Regex TokenPattern = new Regex(
    "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" +              // an identifier
    "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" +               // a number
    "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" +   // a double-quoted string
    // The body of a single-quoted string must stop at ', not at ": excluding the wrong
    // quote made the atomic group swallow the closing ' and the rest of the line.
    "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)",     // a single-quoted string
    RegexOptions.Compiled | RegexOptions.Multiline);

// CultureInvariant keeps the case-insensitive matching independent of the current
// culture (for example, Turkish casing rules do not map 'I' to 'i').
private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$",
    RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$",
    RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
private static readonly Regex FloatNan = new Regex("^nanf?$",
    RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
85
/// <summary>
/// Creates a tokenizer over the given text and reads the first token.
/// </summary>
/// <param name="text">The text to split into tokens.</param>
public TextTokenizer(string text) {
  this.text = text;

  // Step over any leading whitespace/comments first so that the initial
  // NextToken call begins matching at the first real token.
  SkipWhitespace();
  NextToken();
}
92
/// <summary>
/// True once the input is exhausted, i.e. when the current token is empty.
/// </summary>
public bool AtEnd {
  get {
    int remaining = currentToken.Length;
    return remaining == 0;
  }
}
99
/// <summary>
/// Reads the next token from the text into the current token, updating the
/// line/column bookkeeping. At the end of the text the current token becomes
/// the empty string.
/// </summary>
public void NextToken() {
  // The token we are leaving becomes the "previous" token.
  previousLine = line;
  previousColumn = column;

  // Bring the line/column counters up to the start of the token about to be read.
  for (; pos < matchPos; ++pos) {
    if (text[pos] != '\n') {
      ++column;
      continue;
    }
    ++line;
    column = 0;
  }

  // Nothing left to match: signal end of input with an empty token.
  if (matchPos == text.Length) {
    currentToken = "";
    return;
  }

  // Use the regex match when there is one; otherwise fall back to a
  // single-character token so that punctuation is still returned.
  Match candidate = TokenPattern.Match(text, matchPos);
  currentToken = candidate.Success ? candidate.Value : text[matchPos].ToString();
  matchPos += currentToken.Length;

  SkipWhitespace();
}
136
/// <summary>
/// Moves matchPos past any whitespace and comments found at its current
/// position, leaving it at the start of the next token.
/// </summary>
private void SkipWhitespace() {
  Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
  if (!skipped.Success) {
    return;
  }
  matchPos += skipped.Length;
}
146
/// <summary>
/// Consumes the current token if it is exactly equal to the given one.
/// </summary>
/// <param name="token">The token text to compare against.</param>
/// <returns>
/// true if the token matched and was consumed; false if it did not match,
/// in which case the tokenizer is left untouched.
/// </returns>
public bool TryConsume(string token) {
  if (currentToken != token) {
    return false;
  }
  NextToken();
  return true;
}
158
/// <summary>
/// Consumes the current token, which must be exactly equal to the given one.
/// </summary>
/// <param name="token">The token text that is required at this point.</param>
/// <exception cref="FormatException">The current token is not equal to <paramref name="token"/>.</exception>
public void Consume(string token) {
  if (TryConsume(token)) {
    return;
  }
  throw CreateFormatException("Expected \"" + token + "\".");
}
173
/// <summary>
/// Reports whether the current token looks like an integer, judged only by
/// its first character (an ASCII digit, '-' or '+'). The token is not consumed.
/// </summary>
public bool LookingAtInteger() {
  if (currentToken.Length > 0) {
    char first = currentToken[0];
    switch (first) {
      case '-':
      case '+':
        return true;
      default:
        return first >= '0' && first <= '9';
    }
  }
  return false;
}
185
/// <summary>
/// If the current token is an identifier, consumes it and returns its value.
/// A token counts as an identifier when it is non-empty and consists only of
/// ASCII letters, ASCII digits, '_' and '.'.
/// </summary>
/// <returns>The text of the consumed identifier.</returns>
/// <exception cref="FormatException">The current token is empty (end of input)
/// or contains a character that is not allowed in an identifier.</exception>
public string ConsumeIdentifier() {
  // An empty token means end of input. The character loop below would accept it
  // vacuously and hand back "" as an identifier, so reject it explicitly.
  if (currentToken.Length == 0) {
    throw CreateFormatException("Expected identifier.");
  }

  foreach (char c in currentToken) {
    bool allowed = ('a' <= c && c <= 'z') ||
                   ('A' <= c && c <= 'Z') ||
                   ('0' <= c && c <= '9') ||
                   c == '_' || c == '.';
    if (!allowed) {
      throw CreateFormatException("Expected identifier.");
    }
  }

  string result = currentToken;
  NextToken();
  return result;
}
206
/// <summary>
/// Parses the current token with TextFormat.ParseInt32, consumes it and
/// returns the value.
/// </summary>
/// <exception cref="FormatException">The token could not be parsed; the
/// tokenizer is not advanced in that case.</exception>
public int ConsumeInt32() {
  int parsed;
  try {
    parsed = TextFormat.ParseInt32(currentToken);
  } catch (FormatException parseFailure) {
    throw CreateIntegerParseException(parseFailure);
  }
  // Only move on once the token is known to be valid.
  NextToken();
  return parsed;
}
220
/// <summary>
/// Parses the current token with TextFormat.ParseUInt32, consumes it and
/// returns the value.
/// </summary>
/// <exception cref="FormatException">The token could not be parsed; the
/// tokenizer is not advanced in that case.</exception>
public uint ConsumeUInt32() {
  uint parsed;
  try {
    parsed = TextFormat.ParseUInt32(currentToken);
  } catch (FormatException parseFailure) {
    throw CreateIntegerParseException(parseFailure);
  }
  // Only move on once the token is known to be valid.
  NextToken();
  return parsed;
}
234
/// <summary>
/// Parses the current token with TextFormat.ParseInt64, consumes it and
/// returns the value.
/// </summary>
/// <exception cref="FormatException">The token could not be parsed; the
/// tokenizer is not advanced in that case.</exception>
public long ConsumeInt64() {
  long parsed;
  try {
    parsed = TextFormat.ParseInt64(currentToken);
  } catch (FormatException parseFailure) {
    throw CreateIntegerParseException(parseFailure);
  }
  // Only move on once the token is known to be valid.
  NextToken();
  return parsed;
}
248
/// <summary>
/// Parses the current token with TextFormat.ParseUInt64, consumes it and
/// returns the value.
/// </summary>
/// <exception cref="FormatException">The token could not be parsed; the
/// tokenizer is not advanced in that case.</exception>
public ulong ConsumeUInt64() {
  ulong parsed;
  try {
    parsed = TextFormat.ParseUInt64(currentToken);
  } catch (FormatException parseFailure) {
    throw CreateIntegerParseException(parseFailure);
  }
  // Only move on once the token is known to be valid.
  NextToken();
  return parsed;
}
262
/// <summary>
/// If the current token is a double, consumes it and returns its value.
/// The spellings matched by DoubleInfinity (optionally preceded by '-') and
/// "nan" (in any letter case) are handled here; everything else is passed to
/// double.Parse with the invariant culture.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed as a number;
/// the tokenizer is not advanced in that case.</exception>
public double ConsumeDouble() {
  // Infinity and NaN are recognised separately rather than relying on
  // double.Parse to accept the "inf", "infinity" and "nan" spellings.
  if (DoubleInfinity.IsMatch(currentToken)) {
    // Ordinal comparison: the token is machine-readable text, so the current
    // culture must not influence how the sign is detected.
    bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
    NextToken();
    return negative ? double.NegativeInfinity : double.PositiveInfinity;
  }
  if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
    NextToken();
    return Double.NaN;
  }

  try {
    double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
    NextToken();
    return result;
  } catch (FormatException e) {
    throw CreateFloatParseException(e);
  } catch (OverflowException e) {
    throw CreateFloatParseException(e);
  }
}
290
/// <summary>
/// If the current token is a float, consumes it and returns its value.
/// The spellings matched by FloatInfinity and FloatNan are handled here;
/// otherwise a single trailing 'f' suffix is removed and the remainder is
/// passed to float.Parse with the invariant culture.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed as a number;
/// the tokenizer is not advanced and the current token is left unchanged.</exception>
public float ConsumeFloat() {
  // Infinity and NaN are recognised separately rather than relying on
  // float.Parse to accept the "inf", "infinity" and "nan" spellings.
  if (FloatInfinity.IsMatch(currentToken)) {
    // Ordinal comparison: the token is machine-readable text, so the current
    // culture must not influence how the sign is detected.
    bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
    NextToken();
    return negative ? float.NegativeInfinity : float.PositiveInfinity;
  }
  if (FloatNan.IsMatch(currentToken)) {
    NextToken();
    return float.NaN;
  }

  // Strip the suffix on a local copy so that a failed parse does not leave the
  // tokenizer holding a truncated token. Only one 'f' is removed, so a token
  // such as "1.5ff" is still reported as malformed.
  string digits = currentToken;
  if (digits.Length > 0 && digits[digits.Length - 1] == 'f') {
    digits = digits.Substring(0, digits.Length - 1);
  }

  try {
    float result = float.Parse(digits, CultureInfo.InvariantCulture);
    NextToken();
    return result;
  } catch (FormatException e) {
    throw CreateFloatParseException(e);
  } catch (OverflowException e) {
    throw CreateFloatParseException(e);
  }
}
323
/// <summary>
/// Consumes the current token when it is exactly "true" or "false" and
/// returns the corresponding value.
/// </summary>
/// <exception cref="FormatException">The current token is neither "true" nor
/// "false"; the tokenizer is not advanced in that case.</exception>
public bool ConsumeBoolean() {
  bool value;
  switch (currentToken) {
    case "true":
      value = true;
      break;
    case "false":
      value = false;
      break;
    default:
      throw CreateFormatException("Expected \"true\" or \"false\".");
  }
  NextToken();
  return value;
}
339
/// <summary>
/// Consumes the current token as a quoted string via ConsumeByteString and
/// returns the unescaped bytes converted with ToStringUtf8.
/// </summary>
/// <exception cref="FormatException">Thrown by ConsumeByteString when the
/// current token is not a well-formed string.</exception>
public string ConsumeString() {
  ByteString unescaped = ConsumeByteString();
  return unescaped.ToStringUtf8();
}
347
/// <summary>
/// Consumes the current token as a quoted string: strips the surrounding
/// quotes, unescapes the contents with TextFormat.UnescapeBytes and returns
/// the result.
/// </summary>
/// <exception cref="FormatException">The token does not start with a single or
/// double quote, does not end with the same quote, or could not be unescaped.
/// The tokenizer is not advanced in any of these cases.</exception>
public ByteString ConsumeByteString() {
  int length = currentToken.Length;

  // The opening character decides which quote must close the string.
  if (length == 0 || (currentToken[0] != '\"' && currentToken[0] != '\'')) {
    throw CreateFormatException("Expected string.");
  }
  char quote = currentToken[0];

  // A lone quote, or a token that ends in anything else, is unterminated.
  if (length < 2 || currentToken[length - 1] != quote) {
    throw CreateFormatException("String missing ending quote.");
  }

  ByteString unescaped;
  try {
    string inner = currentToken.Substring(1, length - 2);
    unescaped = TextFormat.UnescapeBytes(inner);
  } catch (FormatException unescapeFailure) {
    // Re-issue the failure with the current position prepended.
    throw CreateFormatException(unescapeFailure.Message);
  }

  NextToken();
  return unescaped;
}
372
/// <summary>
/// Builds (but does not throw) a FormatException whose message is the given
/// description prefixed with the position of the current token as
/// "line:column: ".
/// </summary>
/// <param name="description">The text describing what went wrong.</param>
public FormatException CreateFormatException(string description) {
  // Positions are tracked zero-based but reported one-based.
  int displayLine = line + 1;
  int displayColumn = column + 1;
  string message = displayLine + ":" + displayColumn + ": " + description;
  return new FormatException(message);
}
381
/// <summary>
/// Builds (but does not throw) a FormatException whose message is the given
/// description prefixed with the position of the previous token as
/// "line:column: ".
/// </summary>
/// <param name="description">The text describing what went wrong.</param>
public FormatException CreateFormatExceptionPreviousToken(string description) {
  // Positions are tracked zero-based but reported one-based.
  int displayLine = previousLine + 1;
  int displayColumn = previousColumn + 1;
  string message = displayLine + ":" + displayColumn + ": " + description;
  return new FormatException(message);
}
390
/// <summary>
/// Wraps the message of an integer-parsing failure in a FormatException that
/// also carries the position of the current token.
/// </summary>
/// <param name="e">The exception raised while parsing the integer.</param>
private FormatException CreateIntegerParseException(FormatException e) {
  string description = "Couldn't parse integer: " + e.Message;
  return CreateFormatException(description);
}
398
/// <summary>
/// Wraps the message of a float/double-parsing failure in a FormatException
/// that also carries the position of the current token.
/// </summary>
/// <param name="e">The exception raised while parsing the number.</param>
private FormatException CreateFloatParseException(Exception e) {
  string description = "Couldn't parse number: " + e.Message;
  return CreateFormatException(description);
}
406 }
407}