blob: d53ae596a578e29259ac7be1b1672cd35af896fb [file] [log] [blame]
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.
// http://code.google.com/p/protobuf/
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
16using System;
17using System.Globalization;
18using System.Text.RegularExpressions;
19
namespace Google.ProtocolBuffers {
  /// <summary>
  /// Represents a stream of tokens parsed from a string.
  /// </summary>
  /// <remarks>
  /// The tokenizer always holds exactly one current token. It is loaded by the
  /// constructor and replaced each time a token is consumed. Where member
  /// documentation speaks of "the next token" it means this current, not yet
  /// consumed, token. Once the input is exhausted the current token is the
  /// empty string.
  /// </remarks>
  internal sealed class TextTokenizer {
    /// <summary>
    /// The complete input being tokenized.
    /// </summary>
    private readonly string text;

    /// <summary>
    /// The token currently being looked at; empty at the end of the input.
    /// </summary>
    private string currentToken;

    /// <summary>
    /// The character index within the text to perform the next regex match at.
    /// </summary>
    private int matchPos = 0;

    /// <summary>
    /// The character index within the text at which the current token begins.
    /// </summary>
    private int pos = 0;

    /// <summary>
    /// The zero-based line number of the current token.
    /// </summary>
    private int line = 0;

    /// <summary>
    /// The zero-based column number of the current token.
    /// </summary>
    private int column = 0;

    /// <summary>
    /// The zero-based line number of the previous token.
    /// </summary>
    private int previousLine = 0;

    /// <summary>
    /// The zero-based column number of the previous token.
    /// </summary>
    private int previousColumn = 0;

    /// <summary>
    /// Matches a run of whitespace characters and '#' comments (each comment
    /// running to the end of its line), anchored at the match position.
    /// </summary>
    private static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(\\s|(#.*$))+",
        RegexOptions.Compiled | RegexOptions.Multiline);

    /// <summary>
    /// Matches a single identifier, number or quoted string, anchored at the
    /// match position. A string token ends at its closing quote or, when the
    /// quote is missing, at the end of the line.
    /// </summary>
    private static readonly Regex TokenPattern = new Regex(
        "\\G[a-zA-Z_][0-9a-zA-Z_+-]*|" +              // an identifier
        "\\G[0-9+-][0-9a-zA-Z_.+-]*|" +               // a number
        "\\G\"([^\"\\\n\\\\]|\\\\.)*(\"|\\\\?$)|" +   // a double-quoted string
        // The excluded quote must be the single quote here; excluding the
        // double quote would let the match run past the closing quote.
        "\\G\'([^\'\\\n\\\\]|\\\\.)*(\'|\\\\?$)",     // a single-quoted string
        RegexOptions.Compiled | RegexOptions.Multiline);

    // CultureInvariant keeps the case-insensitive matching below independent
    // of the current thread culture.
    private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
    private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
    private static readonly Regex FloatNan = new Regex("^nanf?$",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>
    /// Constructs a tokenizer that parses tokens from the given text and
    /// positions it on the first token.
    /// </summary>
    /// <param name="text">The text to tokenize; must not be null.</param>
    /// <exception cref="ArgumentNullException">text is null.</exception>
    public TextTokenizer(string text) {
      if (text == null) {
        throw new ArgumentNullException("text");
      }
      this.text = text;
      SkipWhitespace();
      NextToken();
    }

    /// <summary>
    /// Are we at the end of the input?
    /// </summary>
    public bool AtEnd {
      get { return currentToken.Length == 0; }
    }

    /// <summary>
    /// Advances to the next token. The position of the token being left is
    /// remembered as the "previous token" position.
    /// </summary>
    public void NextToken() {
      previousLine = line;
      previousColumn = column;

      // Advance the line counter to the current position.
      while (pos < matchPos) {
        if (text[pos] == '\n') {
          ++line;
          column = 0;
        } else {
          ++column;
        }
        ++pos;
      }

      // Match the next token.
      if (matchPos == text.Length) {
        // EOF
        currentToken = "";
      } else {
        Match match = TokenPattern.Match(text, matchPos);
        if (match.Success) {
          currentToken = match.Value;
          matchPos += match.Length;
        } else {
          // Nothing in TokenPattern starts here, so the single character
          // itself becomes the token.
          currentToken = text[matchPos].ToString();
          matchPos++;
        }

        SkipWhitespace();
      }
    }

    /// <summary>
    /// Skip over any whitespace so that matchPos starts at the next token.
    /// </summary>
    private void SkipWhitespace() {
      Match match = WhitespaceAndCommentPattern.Match(text, matchPos);
      if (match.Success) {
        matchPos += match.Length;
      }
    }

    /// <summary>
    /// If the next token exactly matches the given token, consume it and return
    /// true. Otherwise, return false without doing anything.
    /// </summary>
    /// <param name="token">The exact token text to look for.</param>
    /// <returns>true if the token was matched and consumed; otherwise false.</returns>
    public bool TryConsume(string token) {
      if (currentToken == token) {
        NextToken();
        return true;
      }
      return false;
    }

    /// <summary>
    /// If the next token exactly matches the specified one, consume it.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <param name="token">The exact token text that is expected.</param>
    /// <exception cref="FormatException">The next token is not the expected one.</exception>
    public void Consume(string token) {
      if (!TryConsume(token)) {
        throw CreateFormatException("Expected \"" + token + "\".");
      }
    }

    /// <summary>
    /// Returns true if the next token is an integer, but does not consume it.
    /// Only the first character of the token is examined: a digit, '-' or '+'.
    /// </summary>
    public bool LookingAtInteger() {
      if (currentToken.Length == 0) {
        return false;
      }

      char c = currentToken[0];
      return ('0' <= c && c <= '9') || c == '-' || c == '+';
    }

    /// <summary>
    /// If the next token is an identifier, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// A token is accepted when it is non-empty and consists only of ASCII
    /// letters, digits, '_' and '.'.
    /// </remarks>
    /// <exception cref="FormatException">The next token is not an identifier.</exception>
    public string ConsumeIdentifier() {
      // An empty token means the input is exhausted; that is not an identifier.
      if (currentToken.Length == 0) {
        throw CreateFormatException("Expected identifier.");
      }

      foreach (char c in currentToken) {
        if (('a' <= c && c <= 'z') ||
            ('A' <= c && c <= 'Z') ||
            ('0' <= c && c <= '9') ||
            (c == '_') || (c == '.')) {
          // OK
        } else {
          throw CreateFormatException("Expected identifier.");
        }
      }

      string result = currentToken;
      NextToken();
      return result;
    }

    /// <summary>
    /// If the next token is a 32-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public int ConsumeInt32() {
      try {
        int result = TextFormat.ParseInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 32-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public uint ConsumeUInt32() {
      try {
        uint result = TextFormat.ParseUInt32(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit signed integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public long ConsumeInt64() {
      try {
        long result = TextFormat.ParseInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a 64-bit unsigned integer, consume it and return its
    /// value. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public ulong ConsumeUInt64() {
      try {
        ulong result = TextFormat.ParseUInt64(currentToken);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateIntegerParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a double, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public double ConsumeDouble() {
      // The text-format spellings "inf", "infinity" and "nan" (in any case)
      // are recognised here explicitly rather than left to double.Parse().
      if (DoubleInfinity.IsMatch(currentToken)) {
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? double.NegativeInfinity : double.PositiveInfinity;
      }
      if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
        NextToken();
        return Double.NaN;
      }

      try {
        double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a float, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <remarks>
    /// A single trailing 'f' suffix on the token is permitted and ignored.
    /// </remarks>
    /// <exception cref="FormatException">The token could not be parsed.</exception>
    public float ConsumeFloat() {
      // The text-format spellings "inf", "infinity" and "nan" (in any case,
      // optionally followed by 'f') are recognised here explicitly rather
      // than left to float.Parse().
      if (FloatInfinity.IsMatch(currentToken)) {
        bool negative = currentToken.StartsWith("-", StringComparison.Ordinal);
        NextToken();
        return negative ? float.NegativeInfinity : float.PositiveInfinity;
      }
      if (FloatNan.IsMatch(currentToken)) {
        NextToken();
        return float.NaN;
      }

      // Strip at most one trailing 'f' into a local copy, so the current token
      // (and therefore AtEnd and later error handling) is untouched when
      // parsing fails.
      string number = currentToken;
      if (number.EndsWith("f", StringComparison.Ordinal)) {
        number = number.Substring(0, number.Length - 1);
      }

      try {
        float result = float.Parse(number, CultureInfo.InvariantCulture);
        NextToken();
        return result;
      } catch (FormatException e) {
        throw CreateFloatParseException(e);
      } catch (OverflowException e) {
        throw CreateFloatParseException(e);
      }
    }

    /// <summary>
    /// If the next token is a Boolean, consume it and return its value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token is neither "true" nor "false".</exception>
    public bool ConsumeBoolean() {
      if (currentToken == "true") {
        NextToken();
        return true;
      }
      if (currentToken == "false") {
        NextToken();
        return false;
      }
      throw CreateFormatException("Expected \"true\" or \"false\".");
    }

    /// <summary>
    /// If the next token is a string, consume it and return its (unescaped) value.
    /// Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">The token is not a well-formed string.</exception>
    public string ConsumeString() {
      return ConsumeByteString().ToStringUtf8();
    }

    /// <summary>
    /// If the next token is a string, consume it, unescape it as a
    /// ByteString and return it. Otherwise, throw a FormatException.
    /// </summary>
    /// <exception cref="FormatException">
    /// The token does not start with a quote, lacks the matching closing
    /// quote, or could not be unescaped.
    /// </exception>
    public ByteString ConsumeByteString() {
      char quote = currentToken.Length > 0 ? currentToken[0] : '\0';
      if (quote != '\"' && quote != '\'') {
        throw CreateFormatException("Expected string.");
      }

      // The length check rejects a token that is just the opening quote.
      if (currentToken.Length < 2 ||
          currentToken[currentToken.Length - 1] != quote) {
        throw CreateFormatException("String missing ending quote.");
      }

      try {
        string escaped = currentToken.Substring(1, currentToken.Length - 2);
        ByteString result = TextFormat.UnescapeBytes(escaped);
        NextToken();
        return result;
      } catch (FormatException e) {
        // Re-create the exception so the message carries the token position.
        throw CreateFormatException(e.Message);
      }
    }

    /// <summary>
    /// Returns a format exception with the current line and column numbers
    /// in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position.</param>
    public FormatException CreateFormatException(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((line + 1) + ":" + (column + 1) + ": " + description);
    }

    /// <summary>
    /// Returns a format exception with the line and column numbers of the
    /// previous token in the description, suitable for throwing.
    /// </summary>
    /// <param name="description">The message to append after the position.</param>
    public FormatException CreateFormatExceptionPreviousToken(string description) {
      // Note: People generally prefer one-based line and column numbers.
      return new FormatException((previousLine + 1) + ":" + (previousColumn + 1) + ": " + description);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse an integer.
    /// </summary>
    private FormatException CreateIntegerParseException(FormatException e) {
      return CreateFormatException("Couldn't parse integer: " + e.Message);
    }

    /// <summary>
    /// Constructs an appropriate FormatException for the given existing exception
    /// when trying to parse a float or double.
    /// </summary>
    private FormatException CreateFloatParseException(Exception e) {
      return CreateFormatException("Couldn't parse number: " + e.Message);
    }
  }
}