blob: 6d77b5432fe3d2426c9a1665609707d7b8300cc8 [file] [log] [blame]
temporal40ee5512008-07-10 02:12:20 +00001# Protocol Buffers - Google's data interchange format
kenton@google.com24bf56f2008-09-24 20:31:01 +00002# Copyright 2008 Google Inc. All rights reserved.
temporal40ee5512008-07-10 02:12:20 +00003# http://code.google.com/p/protobuf/
4#
kenton@google.com24bf56f2008-09-24 20:31:01 +00005# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met:
temporal40ee5512008-07-10 02:12:20 +00008#
kenton@google.com24bf56f2008-09-24 20:31:01 +00009# * Redistributions of source code must retain the above copyright
10# notice, this list of conditions and the following disclaimer.
11# * Redistributions in binary form must reproduce the above
12# copyright notice, this list of conditions and the following disclaimer
13# in the documentation and/or other materials provided with the
14# distribution.
15# * Neither the name of Google Inc. nor the names of its
16# contributors may be used to endorse or promote products derived from
17# this software without specific prior written permission.
temporal40ee5512008-07-10 02:12:20 +000018#
kenton@google.com24bf56f2008-09-24 20:31:01 +000019# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
temporal40ee5512008-07-10 02:12:20 +000030
31"""Contains routines for printing protocol messages in text format."""
32
33__author__ = 'kenton@google.com (Kenton Varda)'
34
35import cStringIO
kenton@google.com80b1d622009-07-29 01:13:20 +000036import re
temporal40ee5512008-07-10 02:12:20 +000037
kenton@google.com80b1d622009-07-29 01:13:20 +000038from collections import deque
39from google.protobuf.internal import type_checkers
temporal40ee5512008-07-10 02:12:20 +000040from google.protobuf import descriptor
41
# Names exported by this module; everything else is an implementation detail.
__all__ = [ 'MessageToString', 'PrintMessage', 'PrintField',
            'PrintFieldValue', 'Merge' ]
44
45
# Infinity and NaN are not explicitly supported by Python pre-2.6, and
# float('inf') does not work on Windows (pre-2.6).
_INFINITY = 1e10000    # overflows, thus will actually be infinity.
_NAN = _INFINITY * 0   # inf * 0 is IEEE NaN.
50
51
class ParseError(Exception):
  """Thrown in case of ASCII parsing error.

  The message embeds a "line:column : reason" location produced by the
  tokenizer (see _Tokenizer._ParseError / ParseErrorPreviousToken).
  """
54
temporal40ee5512008-07-10 02:12:20 +000055
def MessageToString(message, as_utf8=False, as_one_line=False):
  """Renders a protocol message in the standard text format.

  Args:
    message: The protocol message to render.
    as_utf8: If True, non-ASCII bytes in strings are emitted raw rather
      than octal-escaped.
    as_one_line: If True, the whole message is printed on a single line
      (trailing whitespace stripped).

  Returns:
    The text-format representation as a string.
  """
  buf = cStringIO.StringIO()
  PrintMessage(message, buf, as_utf8=as_utf8, as_one_line=as_one_line)
  text = buf.getvalue()
  buf.close()
  if not as_one_line:
    return text
  return text.rstrip()
64
kenton@google.com80b1d622009-07-29 01:13:20 +000065
def PrintMessage(message, out, indent=0, as_utf8=False, as_one_line=False):
  """Writes every set field of message to the stream out in text format.

  Repeated fields are emitted once per element; all other fields once.
  """
  repeated = descriptor.FieldDescriptor.LABEL_REPEATED
  for field, value in message.ListFields():
    if field.label == repeated:
      for item in value:
        PrintField(field, item, out, indent, as_utf8, as_one_line)
    else:
      PrintField(field, value, out, indent, as_utf8, as_one_line)
kenton@google.com80b1d622009-07-29 01:13:20 +000074
def PrintField(field, value, out, indent=0, as_utf8=False, as_one_line=False):
  """Writes a single field name/value pair to out.

  For repeated fields, value should be a single element, not the whole
  container.
  """
  fd = descriptor.FieldDescriptor
  out.write(' ' * indent)
  if field.is_extension:
    # Extensions print their full name in brackets.  A MessageSet item
    # (optional message extension whose type is its own extension scope)
    # prints the message type name instead of the field path.
    is_message_set_item = (
        field.containing_type.GetOptions().message_set_wire_format and
        field.type == fd.TYPE_MESSAGE and
        field.message_type == field.extension_scope and
        field.label == fd.LABEL_OPTIONAL)
    if is_message_set_item:
      name = field.message_type.full_name
    else:
      name = field.full_name
    out.write('[' + name + ']')
  elif field.type == fd.TYPE_GROUP:
    # For groups, use the capitalized (type) name rather than the field name.
    out.write(field.message_type.name)
  else:
    out.write(field.name)

  if field.cpp_type != fd.CPPTYPE_MESSAGE:
    # The colon is optional in this case, but our cross-language golden files
    # don't include it.
    out.write(': ')

  PrintFieldValue(field, value, out, indent, as_utf8, as_one_line)
  if as_one_line:
    out.write(' ')
  else:
    out.write('\n')
temporal40ee5512008-07-10 02:12:20 +0000106
kenton@google.com80b1d622009-07-29 01:13:20 +0000107
def PrintFieldValue(field, value, out, indent=0,
                    as_utf8=False, as_one_line=False):
  """Writes a single field value (without the field name) to out.

  For repeated fields, value should be a single element.
  """
  fd = descriptor.FieldDescriptor
  cpp_type = field.cpp_type
  if cpp_type == fd.CPPTYPE_MESSAGE:
    # Sub-messages are wrapped in braces; multi-line mode indents the body.
    if as_one_line:
      out.write(' { ')
      PrintMessage(value, out, indent, as_utf8, as_one_line)
      out.write('}')
    else:
      out.write(' {\n')
      PrintMessage(value, out, indent + 2, as_utf8, as_one_line)
      out.write(' ' * indent + '}')
  elif cpp_type == fd.CPPTYPE_ENUM:
    out.write(field.enum_type.values_by_number[value].name)
  elif cpp_type == fd.CPPTYPE_STRING:
    # Unicode values are re-encoded to UTF-8 bytes before escaping.
    if type(value) is unicode:
      escaped = _CEscape(value.encode('utf-8'), as_utf8)
    else:
      escaped = _CEscape(value, as_utf8)
    out.write('"' + escaped + '"')
  elif cpp_type == fd.CPPTYPE_BOOL:
    if value:
      out.write("true")
    else:
      out.write("false")
  else:
    out.write(str(value))
138
kenton@google.com80b1d622009-07-29 01:13:20 +0000139
def Merge(text, message):
  """Merges an ASCII representation of a protocol message into a message.

  Args:
    text: Message ASCII representation.
    message: A protocol buffer message to merge into.

  Raises:
    ParseError: On ASCII parsing problems.
  """
  tok = _Tokenizer(text)
  while not tok.AtEnd():
    _MergeField(tok, message)
153
154
def _MergeField(tokenizer, message):
  """Merges a single protocol message field into a message.

  Consumes one "name: value" or "Name { ... }" entry from the tokenizer and
  records it into message, recursing for sub-messages.

  Args:
    tokenizer: A tokenizer to parse the field name and values.
    message: A protocol message to record the data.

  Raises:
    ParseError: In case of ASCII parsing problems.
  """
  message_descriptor = message.DESCRIPTOR
  if tokenizer.TryConsume('['):
    # Extension field: a dotted full name inside brackets, e.g. [pkg.ext].
    name = [tokenizer.ConsumeIdentifier()]
    while tokenizer.TryConsume('.'):
      name.append(tokenizer.ConsumeIdentifier())
    name = '.'.join(name)

    if not message_descriptor.is_extendable:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" does not have extensions.' %
          message_descriptor.full_name)
    field = message.Extensions._FindExtensionByName(name)
    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" not registered.' % name)
    elif message_descriptor != field.containing_type:
      raise tokenizer.ParseErrorPreviousToken(
          'Extension "%s" does not extend message type "%s".' % (
          name, message_descriptor.full_name))
    tokenizer.Consume(']')
  else:
    name = tokenizer.ConsumeIdentifier()
    field = message_descriptor.fields_by_name.get(name, None)

    # Group names are expected to be capitalized as they appear in the
    # .proto file, which actually matches their type names, not their field
    # names.
    if not field:
      # Retry with the lowercased name, but only accept it for groups.
      field = message_descriptor.fields_by_name.get(name.lower(), None)
      if field and field.type != descriptor.FieldDescriptor.TYPE_GROUP:
        field = None

    # A group must be spelled exactly as its capitalized type name.
    if (field and field.type == descriptor.FieldDescriptor.TYPE_GROUP and
        field.message_type.name != name):
      field = None

    if not field:
      raise tokenizer.ParseErrorPreviousToken(
          'Message type "%s" has no field named "%s".' % (
          message_descriptor.full_name, name))

  if field.cpp_type == descriptor.FieldDescriptor.CPPTYPE_MESSAGE:
    # The colon before a sub-message is optional in text format.
    tokenizer.TryConsume(':')

    # Sub-messages may be delimited by either <...> or {...}.
    if tokenizer.TryConsume('<'):
      end_token = '>'
    else:
      tokenizer.Consume('{')
      end_token = '}'

    if field.label == descriptor.FieldDescriptor.LABEL_REPEATED:
      if field.is_extension:
        sub_message = message.Extensions[field].add()
      else:
        sub_message = getattr(message, field.name).add()
    else:
      if field.is_extension:
        sub_message = message.Extensions[field]
      else:
        sub_message = getattr(message, field.name)
      # Mark the singular sub-message as present even if it stays empty.
      sub_message.SetInParent()

    while not tokenizer.TryConsume(end_token):
      if tokenizer.AtEnd():
        raise tokenizer.ParseErrorPreviousToken('Expected "%s".' % (end_token))
      _MergeField(tokenizer, sub_message)
  else:
    _MergeScalarField(tokenizer, message, field)
233
234
def _MergeScalarField(tokenizer, message, field):
  """Merges a single scalar field value into a message.

  Consumes the ':' and the value token(s) for field and stores the parsed
  value (appending for repeated fields).

  Args:
    tokenizer: A tokenizer to parse the field value.
    message: A protocol message to record the data.
    field: The descriptor of the field to be merged.

  Raises:
    ParseError: In case of ASCII parsing problems.
    RuntimeError: On runtime errors.
  """
  tokenizer.Consume(':')
  fd = descriptor.FieldDescriptor
  ftype = field.type

  if ftype in (fd.TYPE_INT32, fd.TYPE_SINT32, fd.TYPE_SFIXED32):
    value = tokenizer.ConsumeInt32()
  elif ftype in (fd.TYPE_INT64, fd.TYPE_SINT64, fd.TYPE_SFIXED64):
    value = tokenizer.ConsumeInt64()
  elif ftype in (fd.TYPE_UINT32, fd.TYPE_FIXED32):
    value = tokenizer.ConsumeUint32()
  elif ftype in (fd.TYPE_UINT64, fd.TYPE_FIXED64):
    value = tokenizer.ConsumeUint64()
  elif ftype in (fd.TYPE_FLOAT, fd.TYPE_DOUBLE):
    value = tokenizer.ConsumeFloat()
  elif ftype == fd.TYPE_BOOL:
    value = tokenizer.ConsumeBool()
  elif ftype == fd.TYPE_STRING:
    value = tokenizer.ConsumeString()
  elif ftype == fd.TYPE_BYTES:
    value = tokenizer.ConsumeByteString()
  elif ftype == fd.TYPE_ENUM:
    # Enum can be specified by a number (the enum value), or by
    # a string literal (the enum name).
    enum_descriptor = field.enum_type
    if tokenizer.LookingAtInteger():
      number = tokenizer.ConsumeInt32()
      enum_value = enum_descriptor.values_by_number.get(number, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value with number %d.' % (
            enum_descriptor.full_name, number))
    else:
      identifier = tokenizer.ConsumeIdentifier()
      enum_value = enum_descriptor.values_by_name.get(identifier, None)
      if enum_value is None:
        raise tokenizer.ParseErrorPreviousToken(
            'Enum type "%s" has no value named %s.' % (
            enum_descriptor.full_name, identifier))
    value = enum_value.number
  else:
    raise RuntimeError('Unknown field type %d' % field.type)

  if field.label == fd.LABEL_REPEATED:
    if field.is_extension:
      message.Extensions[field].append(value)
    else:
      getattr(message, field.name).append(value)
  else:
    if field.is_extension:
      message.Extensions[field] = value
    else:
      setattr(message, field.name, value)
305
306
307class _Tokenizer(object):
308 """Protocol buffer ASCII representation tokenizer.
309
310 This class handles the lower level string parsing by splitting it into
311 meaningful tokens.
312
313 It was directly ported from the Java protocol buffer API.
314 """
315
316 _WHITESPACE = re.compile('(\\s|(#.*$))+', re.MULTILINE)
317 _TOKEN = re.compile(
318 '[a-zA-Z_][0-9a-zA-Z_+-]*|' # an identifier
319 '[0-9+-][0-9a-zA-Z_.+-]*|' # a number
320 '\"([^\"\n\\\\]|\\\\.)*(\"|\\\\?$)|' # a double-quoted string
kenton@google.comeef5f832009-12-23 01:32:45 +0000321 '\'([^\'\n\\\\]|\\\\.)*(\'|\\\\?$)') # a single-quoted string
kenton@google.com80b1d622009-07-29 01:13:20 +0000322 _IDENTIFIER = re.compile('\w+')
323 _INTEGER_CHECKERS = [type_checkers.Uint32ValueChecker(),
324 type_checkers.Int32ValueChecker(),
325 type_checkers.Uint64ValueChecker(),
326 type_checkers.Int64ValueChecker()]
327 _FLOAT_INFINITY = re.compile('-?inf(inity)?f?', re.IGNORECASE)
328 _FLOAT_NAN = re.compile("nanf?", re.IGNORECASE)
329
330 def __init__(self, text_message):
331 self._text_message = text_message
332
333 self._position = 0
334 self._line = -1
335 self._column = 0
336 self._token_start = None
337 self.token = ''
338 self._lines = deque(text_message.split('\n'))
339 self._current_line = ''
340 self._previous_line = 0
341 self._previous_column = 0
342 self._SkipWhitespace()
343 self.NextToken()
344
345 def AtEnd(self):
346 """Checks the end of the text was reached.
347
348 Returns:
349 True iff the end was reached.
350 """
liujisi@google.com33165fe2010-11-02 13:14:58 +0000351 return self.token == ''
kenton@google.com80b1d622009-07-29 01:13:20 +0000352
353 def _PopLine(self):
liujisi@google.com33165fe2010-11-02 13:14:58 +0000354 while len(self._current_line) <= self._column:
kenton@google.com80b1d622009-07-29 01:13:20 +0000355 if not self._lines:
356 self._current_line = ''
357 return
358 self._line += 1
359 self._column = 0
360 self._current_line = self._lines.popleft()
361
362 def _SkipWhitespace(self):
363 while True:
364 self._PopLine()
liujisi@google.com33165fe2010-11-02 13:14:58 +0000365 match = self._WHITESPACE.match(self._current_line, self._column)
kenton@google.com80b1d622009-07-29 01:13:20 +0000366 if not match:
367 break
368 length = len(match.group(0))
kenton@google.com80b1d622009-07-29 01:13:20 +0000369 self._column += length
370
371 def TryConsume(self, token):
372 """Tries to consume a given piece of text.
373
374 Args:
375 token: Text to consume.
376
377 Returns:
378 True iff the text was consumed.
379 """
380 if self.token == token:
381 self.NextToken()
382 return True
383 return False
384
385 def Consume(self, token):
386 """Consumes a piece of text.
387
388 Args:
389 token: Text to consume.
390
391 Raises:
392 ParseError: If the text couldn't be consumed.
393 """
394 if not self.TryConsume(token):
395 raise self._ParseError('Expected "%s".' % token)
396
397 def LookingAtInteger(self):
398 """Checks if the current token is an integer.
399
400 Returns:
401 True iff the current token is an integer.
402 """
403 if not self.token:
404 return False
405 c = self.token[0]
406 return (c >= '0' and c <= '9') or c == '-' or c == '+'
407
408 def ConsumeIdentifier(self):
409 """Consumes protocol message field identifier.
410
411 Returns:
412 Identifier string.
413
414 Raises:
415 ParseError: If an identifier couldn't be consumed.
416 """
417 result = self.token
liujisi@google.com33165fe2010-11-02 13:14:58 +0000418 if not self._IDENTIFIER.match(result):
kenton@google.com80b1d622009-07-29 01:13:20 +0000419 raise self._ParseError('Expected identifier.')
420 self.NextToken()
421 return result
422
423 def ConsumeInt32(self):
424 """Consumes a signed 32bit integer number.
425
426 Returns:
427 The integer parsed.
428
429 Raises:
430 ParseError: If a signed 32bit integer couldn't be consumed.
431 """
432 try:
433 result = self._ParseInteger(self.token, is_signed=True, is_long=False)
434 except ValueError, e:
435 raise self._IntegerParseError(e)
436 self.NextToken()
437 return result
438
439 def ConsumeUint32(self):
440 """Consumes an unsigned 32bit integer number.
441
442 Returns:
443 The integer parsed.
444
445 Raises:
446 ParseError: If an unsigned 32bit integer couldn't be consumed.
447 """
448 try:
449 result = self._ParseInteger(self.token, is_signed=False, is_long=False)
450 except ValueError, e:
451 raise self._IntegerParseError(e)
452 self.NextToken()
453 return result
454
455 def ConsumeInt64(self):
456 """Consumes a signed 64bit integer number.
457
458 Returns:
459 The integer parsed.
460
461 Raises:
462 ParseError: If a signed 64bit integer couldn't be consumed.
463 """
464 try:
465 result = self._ParseInteger(self.token, is_signed=True, is_long=True)
466 except ValueError, e:
467 raise self._IntegerParseError(e)
468 self.NextToken()
469 return result
470
471 def ConsumeUint64(self):
472 """Consumes an unsigned 64bit integer number.
473
474 Returns:
475 The integer parsed.
476
477 Raises:
478 ParseError: If an unsigned 64bit integer couldn't be consumed.
479 """
480 try:
481 result = self._ParseInteger(self.token, is_signed=False, is_long=True)
482 except ValueError, e:
483 raise self._IntegerParseError(e)
484 self.NextToken()
485 return result
486
487 def ConsumeFloat(self):
488 """Consumes an floating point number.
489
490 Returns:
491 The number parsed.
492
493 Raises:
494 ParseError: If a floating point number couldn't be consumed.
495 """
496 text = self.token
liujisi@google.com33165fe2010-11-02 13:14:58 +0000497 if self._FLOAT_INFINITY.match(text):
kenton@google.com80b1d622009-07-29 01:13:20 +0000498 self.NextToken()
499 if text.startswith('-'):
kenton@google.comd0047c42009-12-23 02:01:01 +0000500 return -_INFINITY
501 return _INFINITY
kenton@google.com80b1d622009-07-29 01:13:20 +0000502
liujisi@google.com33165fe2010-11-02 13:14:58 +0000503 if self._FLOAT_NAN.match(text):
kenton@google.com80b1d622009-07-29 01:13:20 +0000504 self.NextToken()
kenton@google.comd0047c42009-12-23 02:01:01 +0000505 return _NAN
kenton@google.com80b1d622009-07-29 01:13:20 +0000506
507 try:
508 result = float(text)
509 except ValueError, e:
510 raise self._FloatParseError(e)
511 self.NextToken()
512 return result
513
514 def ConsumeBool(self):
515 """Consumes a boolean value.
516
517 Returns:
518 The bool parsed.
519
520 Raises:
521 ParseError: If a boolean value couldn't be consumed.
522 """
liujisi@google.com33165fe2010-11-02 13:14:58 +0000523 if self.token in ('true', 't', '1'):
kenton@google.com80b1d622009-07-29 01:13:20 +0000524 self.NextToken()
525 return True
liujisi@google.com33165fe2010-11-02 13:14:58 +0000526 elif self.token in ('false', 'f', '0'):
kenton@google.com80b1d622009-07-29 01:13:20 +0000527 self.NextToken()
528 return False
529 else:
530 raise self._ParseError('Expected "true" or "false".')
531
532 def ConsumeString(self):
533 """Consumes a string value.
534
535 Returns:
536 The string parsed.
537
538 Raises:
539 ParseError: If a string value couldn't be consumed.
540 """
liujisi@google.com33165fe2010-11-02 13:14:58 +0000541 bytes = self.ConsumeByteString()
542 try:
543 return unicode(bytes, 'utf-8')
544 except UnicodeDecodeError, e:
545 raise self._StringParseError(e)
kenton@google.com80b1d622009-07-29 01:13:20 +0000546
547 def ConsumeByteString(self):
548 """Consumes a byte array value.
549
550 Returns:
551 The array parsed (as a string).
552
553 Raises:
554 ParseError: If a byte array value couldn't be consumed.
555 """
kenton@google.com53530182010-01-07 02:08:03 +0000556 list = [self._ConsumeSingleByteString()]
kenton@google.comeef5f832009-12-23 01:32:45 +0000557 while len(self.token) > 0 and self.token[0] in ('\'', '"'):
kenton@google.com53530182010-01-07 02:08:03 +0000558 list.append(self._ConsumeSingleByteString())
kenton@google.comeef5f832009-12-23 01:32:45 +0000559 return "".join(list)
560
kenton@google.com53530182010-01-07 02:08:03 +0000561 def _ConsumeSingleByteString(self):
562 """Consume one token of a string literal.
563
564 String literals (whether bytes or text) can come in multiple adjacent
565 tokens which are automatically concatenated, like in C or Python. This
566 method only consumes one token.
567 """
kenton@google.com80b1d622009-07-29 01:13:20 +0000568 text = self.token
569 if len(text) < 1 or text[0] not in ('\'', '"'):
570 raise self._ParseError('Exptected string.')
571
572 if len(text) < 2 or text[-1] != text[0]:
573 raise self._ParseError('String missing ending quote.')
574
575 try:
576 result = _CUnescape(text[1:-1])
577 except ValueError, e:
578 raise self._ParseError(str(e))
579 self.NextToken()
580 return result
581
582 def _ParseInteger(self, text, is_signed=False, is_long=False):
583 """Parses an integer.
584
585 Args:
586 text: The text to parse.
587 is_signed: True if a signed integer must be parsed.
588 is_long: True if a long integer must be parsed.
589
590 Returns:
591 The integer value.
592
593 Raises:
594 ValueError: Thrown Iff the text is not a valid integer.
595 """
596 pos = 0
597 if text.startswith('-'):
598 pos += 1
599
600 base = 10
601 if text.startswith('0x', pos) or text.startswith('0X', pos):
602 base = 16
603 elif text.startswith('0', pos):
604 base = 8
605
606 # Do the actual parsing. Exception handling is propagated to caller.
607 result = int(text, base)
608
609 # Check if the integer is sane. Exceptions handled by callers.
610 checker = self._INTEGER_CHECKERS[2 * int(is_long) + int(is_signed)]
611 checker.CheckValue(result)
612 return result
613
614 def ParseErrorPreviousToken(self, message):
615 """Creates and *returns* a ParseError for the previously read token.
616
617 Args:
618 message: A message to set for the exception.
619
620 Returns:
621 A ParseError instance.
622 """
623 return ParseError('%d:%d : %s' % (
624 self._previous_line + 1, self._previous_column + 1, message))
625
626 def _ParseError(self, message):
627 """Creates and *returns* a ParseError for the current token."""
628 return ParseError('%d:%d : %s' % (
liujisi@google.com33165fe2010-11-02 13:14:58 +0000629 self._line + 1, self._column - len(self.token) + 1, message))
kenton@google.com80b1d622009-07-29 01:13:20 +0000630
631 def _IntegerParseError(self, e):
632 return self._ParseError('Couldn\'t parse integer: ' + str(e))
633
634 def _FloatParseError(self, e):
635 return self._ParseError('Couldn\'t parse number: ' + str(e))
636
liujisi@google.com33165fe2010-11-02 13:14:58 +0000637 def _StringParseError(self, e):
638 return self._ParseError('Couldn\'t parse string: ' + str(e))
639
kenton@google.com80b1d622009-07-29 01:13:20 +0000640 def NextToken(self):
641 """Reads the next meaningful token."""
642 self._previous_line = self._line
643 self._previous_column = self._column
liujisi@google.com33165fe2010-11-02 13:14:58 +0000644
645 self._column += len(self.token)
646 self._SkipWhitespace()
647
648 if not self._lines and len(self._current_line) <= self._column:
kenton@google.com80b1d622009-07-29 01:13:20 +0000649 self.token = ''
650 return
kenton@google.com80b1d622009-07-29 01:13:20 +0000651
liujisi@google.com33165fe2010-11-02 13:14:58 +0000652 match = self._TOKEN.match(self._current_line, self._column)
kenton@google.com80b1d622009-07-29 01:13:20 +0000653 if match:
654 token = match.group(0)
kenton@google.com80b1d622009-07-29 01:13:20 +0000655 self.token = token
656 else:
liujisi@google.com33165fe2010-11-02 13:14:58 +0000657 self.token = self._current_line[self._column]
kenton@google.com80b1d622009-07-29 01:13:20 +0000658
659
temporal40ee5512008-07-10 02:12:20 +0000660# text.encode('string_escape') does not seem to satisfy our needs as it
661# encodes unprintable characters using two-digit hex escapes whereas our
662# C++ unescaping function allows hex escapes to be any length. So,
663# "\0011".encode('string_escape') ends up being "\\x011", which will be
664# decoded in C++ as a single-character string with char code 0x11.
liujisi@google.com33165fe2010-11-02 13:14:58 +0000665def _CEscape(text, as_utf8):
temporal40ee5512008-07-10 02:12:20 +0000666 def escape(c):
667 o = ord(c)
668 if o == 10: return r"\n" # optional escape
669 if o == 13: return r"\r" # optional escape
670 if o == 9: return r"\t" # optional escape
671 if o == 39: return r"\'" # optional escape
672
673 if o == 34: return r'\"' # necessary escape
674 if o == 92: return r"\\" # necessary escape
675
liujisi@google.com33165fe2010-11-02 13:14:58 +0000676 # necessary escapes
677 if not as_utf8 and (o >= 127 or o < 32): return "\\%03o" % o
temporal40ee5512008-07-10 02:12:20 +0000678 return c
679 return "".join([escape(c) for c in text])
kenton@google.com80b1d622009-07-29 01:13:20 +0000680
681
# Matches a C-style hex escape: backslash-x followed by one or two hex digits
# (two-digit alternative listed first so the longest match wins).
_CUNESCAPE_HEX = re.compile('\\\\x([0-9a-fA-F]{2}|[0-9a-fA-F])')
kenton@google.com80b1d622009-07-29 01:13:20 +0000683
684
def _CUnescape(text):
  """Reverses _CEscape: decodes C-style escape sequences in text."""
  def _HexToChar(match):
    # group(1) holds the hex digits captured after the '\x'.
    return chr(int(match.group(1), 16))
  # Expand hex escapes by hand first because the 'string_escape' codec
  # doesn't allow single-digit hex escapes (like '\xf').
  expanded = _CUNESCAPE_HEX.sub(_HexToChar, text)
  return expanded.decode('string_escape')