blob: d25a5874e6f4db144767dd615c47920b107fe6a3 [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// http://github.com/jskeet/dotnet-protobufs/
// Original C++/Java/Python code:
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Jon Skeet68036862008-10-22 13:30:34 +010032using System;
33using System.Globalization;
34using System.Text.RegularExpressions;
35
namespace Google.ProtocolBuffers {
  /// <summary>
  /// Represents a stream of tokens parsed from a string.
  /// </summary>
  /// <remarks>
  /// The tokenizer always holds one token of look-ahead (the "current" token).
  /// The Consume* methods inspect that token and, on success, advance to the
  /// next one. An empty current token means the end of the input was reached.
  /// </remarks>
  internal sealed class TextTokenizer {
    /// <summary>
    /// The complete text being tokenized.
    /// </summary>
    private readonly string text;

    /// <summary>
    /// The current (look-ahead) token; the empty string at end of input.
    /// </summary>
    private string currentToken;

    /// <summary>
    /// The character index within the text to perform the next regex match at.
    /// </summary>
    private int matchPos = 0;

    /// <summary>
    /// The character index within the text at which the current token begins.
    /// </summary>
    private int pos = 0;

    /// <summary>
    /// The (zero-based) line number of the current token.
    /// </summary>
    private int line = 0;

    /// <summary>
    /// The (zero-based) column number of the current token.
    /// </summary>
    private int column = 0;

    /// <summary>
    /// The (zero-based) line number of the previous token.
    /// </summary>
    private int previousLine = 0;

    /// <summary>
    /// The (zero-based) column number of the previous token.
    /// </summary>
    private int previousColumn = 0;

#if SILVERLIGHT
    private const RegexOptions CompiledRegexWhereAvailable = RegexOptions.None;
#else
    private const RegexOptions CompiledRegexWhereAvailable = RegexOptions.Compiled;
#endif

    // Note: atomic groups used to mimic possessive quantifiers in Java in both of these regexes.
    // Every alternative is anchored with \G so a match can only start exactly at matchPos.

    // One or more whitespace characters and/or '#' comments running to the end of a line.
    private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(?>(\\s|(#.*$))+)",
        CompiledRegexWhereAvailable | RegexOptions.Multiline);

    // Uses CompiledRegexWhereAvailable (rather than RegexOptions.Compiled directly) so that the
    // SILVERLIGHT switch above applies to this pattern as well as to all the others.
    private static readonly Regex TokenPattern = new Regex(
        "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" +             // an identifier
        "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" +              // a number
        "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" +  // a double-quoted string
        // The body of a single-quoted string must exclude the single quote itself; otherwise
        // the atomic group swallows the closing quote and everything up to the end of the line.
        "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)",    // a single-quoted string
        CompiledRegexWhereAvailable | RegexOptions.Multiline);

    private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$", CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);
    private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$", CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);
    private static readonly Regex FloatNan = new Regex("^nanf?$", CompiledRegexWhereAvailable | RegexOptions.IgnoreCase);

    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text. Leading
    /// whitespace and comments are skipped and the first token is read immediately.
    /// </summary>
    /// <param name="text">The text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public TextTokenizer(string text) {
      if (text == null) {
        throw new ArgumentNullException("text");
      }
      this.text = text;
      SkipWhitespace();
      NextToken();
    }

    /// <summary>
    /// Are we at the end of the input?
    /// </summary>
    public bool AtEnd {
      get { return currentToken.Length == 0; }
    }

    /// <summary>
    /// Advances to the next token.
    /// </summary>
    public void NextToken() {
      // Remember where the token that is about to be replaced started, so that
      // CreateFormatExceptionPreviousToken can still report its position.
      previousLine = line;
      previousColumn = column;

      // Advance the line counter to the current position.
      while (pos < matchPos) {
        if (text[pos] == '\n') {
          ++line;
          column = 0;
        } else {
          ++column;
        }
        ++pos;
      }

      // Match the next token.
      if (matchPos == text.Length) {
        // EOF
        currentToken = "";
      } else {
        Match match = TokenPattern.Match(text, matchPos);
        if (match.Success) {
          currentToken = match.Value;
          matchPos += match.Length;
        } else {
          // Not an identifier, number or string: take one character as the token.
          currentToken = text[matchPos].ToString();
          matchPos++;
        }

        SkipWhitespace();
      }
    }

    /// <summary>
    /// Skip over any whitespace so that matchPos starts at the next token.
    /// </summary>
    private void SkipWhitespace() {
      Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
      if (match.Success) {
        matchPos += match.Length;
      }
    }

    /// <summary>
    /// If the next token exactly matches the given token, consume it and return
    /// true. Otherwise, return false without doing anything.
    /// </summary>
    /// <param name="token">The exact token text to look for.</param>
    /// <returns>true if the token was matched and consumed; otherwise false.</returns>
    public bool TryConsume(string token) {
      if (currentToken == token) {
        NextToken();
        return true;
      }
      return false;
    }

    /// <summary>
    /// If the next token exactly matches the specified one, consume it.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is expected next.</param>
    /// <exception cref="FormatException">The next token is not <paramref name="token"/>.</exception>
    public void Consume(string token) {
      if (!TryConsume(token)) {
        throw CreateFormatException("Expected \"" + token + "\".");
      }
    }

    /// <summary>
    /// Returns true if the next token is an integer, but does not consume it.
    /// </summary>
    /// <remarks>
    /// Only the first character of the token is examined: a decimal digit or a sign.
    /// </remarks>
    public bool LookingAtInteger() {
      if (currentToken.Length == 0) {
        return false;
      }

      char c = currentToken[0];
      return ('0' <= c && c <= '9') || c == '-' || c == '+';
    }

    /// <summary>
    /// If the next token is an identifier, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The text of the identifier token.</returns>
    /// <exception cref="FormatException">
    /// The input is exhausted, or the next token contains a character other than
    /// an ASCII letter, an ASCII digit, '_' or '.'.
    /// </exception>
    public string ConsumeIdentifier() {
      // The character loop below is vacuously satisfied by the empty
      // end-of-input token, so reject that case explicitly.
      if (currentToken.Length == 0) {
        throw CreateFormatException("Expected identifier.");
      }

      foreach (char c in currentToken) {
        if (('a' <= c && c <= 'z') ||
            ('A' <= c && c <= 'Z') ||
            ('0' <= c && c <= '9') ||
            (c == '_') || (c == '.')) {
          // OK
        } else {
          throw CreateFormatException("Expected identifier.");
        }
      }

      string result = currentToken;
      NextToken();
      return result;
    }

    /// <summary>
    /// If the next token is a 32-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The value produced by TextFormat.ParseInt32 for the token.</returns>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public int ConsumeInt32() {
      try {
        int result = TextFormat.ParseInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 32-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The value produced by TextFormat.ParseUInt32 for the token.</returns>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public uint ConsumeUInt32() {
      try {
        uint result = TextFormat.ParseUInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The value produced by TextFormat.ParseInt64 for the token.</returns>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public long ConsumeInt64() {
      try {
        long result = TextFormat.ParseInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The value produced by TextFormat.ParseUInt64 for the token.</returns>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public ulong ConsumeUInt64() {
      try {
        ulong result = TextFormat.ParseUInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a double, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The parsed value, which may be an infinity or NaN.</returns>
    /// <exception cref="FormatException">The token could not be parsed as a double.</exception>
    public double ConsumeDouble() {
      // We need to parse infinity and nan separately because
      // double.Parse() does not accept "inf", "infinity", or "nan".
      if (DoubleInfinity.IsMatch(currentToken)) {
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? double.NegativeInfinity : double.PositiveInfinity;
      }
      if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
        NextToken();
        return Double.NaN;
      }

      try {
        double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a float, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// A single trailing 'f' suffix (as in "1.5f") is accepted and ignored.
    /// The current token is left untouched when parsing fails.
    /// </remarks>
    /// <returns>The parsed value, which may be an infinity or NaN.</returns>
    /// <exception cref="FormatException">The token could not be parsed as a float.</exception>
    public float ConsumeFloat() {
      // We need to parse infinity and nan separately because
      // float.Parse() does not accept "inf", "infinity", or "nan".
      if (FloatInfinity.IsMatch(currentToken)) {
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? float.NegativeInfinity : float.PositiveInfinity;
      }
      if (FloatNan.IsMatch(currentToken)) {
        NextToken();
        return float.NaN;
      }

      // Strip the suffix from a local copy only: overwriting currentToken would
      // corrupt the look-ahead token (and could even make AtEnd report true)
      // if the parse below fails.
      string number = currentToken;
      if (number.EndsWith("f", StringComparison.Ordinal)) {
        number = number.Substring(0, number.Length - 1);
      }

      try {
        float result = float.Parse(number, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a Boolean, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>true for the token "true", false for the token "false".</returns>
    /// <exception cref="FormatException">The next token is neither "true" nor "false".</exception>
    public bool ConsumeBoolean() {
      if (currentToken == "true") {
        NextToken();
        return true;
      }
      if (currentToken == "false") {
        NextToken();
        return false;
      }
      throw CreateFormatException("Expected \"true\" or \"false\".");
    }

    /// <summary>
    /// If the next token is a string, consume it and return its (unescaped) value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The result of ConsumeByteString converted with ToStringUtf8.</returns>
    /// <exception cref="FormatException">The next token is not a well-formed string.</exception>
    public string ConsumeString() {
      return ConsumeByteString().ToStringUtf8();
    }

    /// <summary>
    /// If the next token is a string, consume it, unescape it as a
    /// ByteString and return it. Otherwise, throw a FormatException.
    /// </summary>
    /// <returns>The token's contents, without the quotes, as unescaped by TextFormat.UnescapeBytes.</returns>
    /// <exception cref="FormatException">
    /// The next token does not start with a quote, is not terminated by the
    /// same quote character, or could not be unescaped.
    /// </exception>
    public ByteString ConsumeByteString() {
      char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
      if (quote != '\"' && quote != '\'') {
        throw CreateFormatException("Expected string.");
      }

      // A lone quote character, or a token cut short by the end of the line,
      // has no matching closing quote.
      if (currentToken.Length < 2 ||
          currentToken[currentToken.Length - 1] != quote) {
        throw CreateFormatException("String missing ending quote.");
      }

      try {
        string escaped = currentToken.Substring(1, currentToken.Length - 2);
        ByteString result = TextFormat.UnescapeBytes(escaped);
        NextToken();
        return result;
      } catch (FormatException e) {
        // Re-wrap so that the message carries the line and column prefix.
        throw CreateFormatException(e.Message);
      }
    }

    /// <summary>
    /// Returns a format exception with the current line and column numbers
    /// in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position prefix.</param>
    public FormatException CreateFormatException(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
    }

    /// <summary>
    /// Returns a format exception with the line and column numbers of the
    /// previous token in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position prefix.</param>
    public FormatException CreateFormatExceptionPreviousToken(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse an integer.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e) {
      return CreateFormatException("Couldn't parse integer: " + e.Message);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse a float or double.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e) {
      return CreateFormatException("Couldn't parse number: " + e.Message);
    }
  }
}