blob: dfcc6284a28b87f1aff0012cc8a66358be1c005e [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Ezio Melottie3992eb2011-05-14 06:24:53 +03007from json import scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
13__all__ = ['JSONDecoder']
14
# Flags shared by every regular expression compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
16
def _floatconstants():
    """Return the float constants ``(nan, inf, -inf)``.

    The values are unpacked from explicit big-endian IEEE 754 bit
    patterns.  Naming the byte order in the struct format ('>') means the
    result does not depend on the host's byte order, and no
    hex-string codec is needed to build the byte strings.
    """
    # 0x7FF8000000000000 is a quiet NaN, 0x7FF0000000000000 is +Infinity.
    nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
    inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
    return nan, inf, -inf

NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Return the 1-based ``(line, column)`` of offset ``pos`` in ``doc``.

    Lines are separated by ``'\\n'``; only newlines before ``pos`` count.
    """
    line = doc.count('\n', 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, so on the first line
    # the column comes out as pos + 1 without a special case.
    column = pos - doc.rfind('\n', 0, pos)
    return line, column
34
35
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location of ``pos`` in ``doc``.

    With only ``pos`` the result names a single line/column/character
    position.  When ``end`` is also given the result names the span from
    ``pos`` to ``end``.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is not None:
        endlineno, endcolno = linecol(doc, end)
        span_fmt = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return span_fmt.format(
            msg, lineno, colno, endlineno, endcolno, pos, end)
    point_fmt = '{0}: line {1} column {2} (char {3})'
    return point_fmt.format(msg, lineno, colno, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Non-standard numeric literals mapped to the float each one denotes.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}

# Encoding assumed for byte strings when the caller does not name one.
DEFAULT_ENCODING = "utf-8"

# Single-character escapes (the character after a backslash) and the
# character each one stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Lazily matches a run of characters followed by the first double quote,
# backslash or control character (0x00-0x1f).
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
64
# Exactly four hexadecimal digits, as required after ``\u``.
_HEXDIGITS = re.compile(r'[0-9A-Fa-f]{4}')


def _decode_uXXXX(s, pos, _hex=_HEXDIGITS.match):
    """Return the integer encoded by the four hex digits after ``s[pos]``.

    ``pos`` is the index of the ``u`` of a ``\\uXXXX`` escape.  Raises
    ValueError (with position information) when fewer than four
    characters follow or any of them is not a hexadecimal digit.
    """
    digits = _hex(s, pos + 1)
    if digits is None:
        # Checked with a regex rather than int() alone, because
        # int(x, 16) also accepts signs, whitespace and an '0x' prefix.
        raise ValueError(errmsg("Invalid \\uXXXX escape", s, pos))
    return int(digits.group(), 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` (default ``DEFAULT_ENCODING``) is used to decode chunks
    of ``s`` that are not already ``unicode``.

    A ``\\uXXXX`` high surrogate immediately followed by a ``\\uXXXX`` low
    surrogate is combined into one character on builds that can represent
    it; a surrogate that is not part of such a pair is kept as is.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated" error messages.
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence
            uni = _decode_uXXXX(s, end)
            end += 5
            # Check for surrogate pair on UCS-4 systems.  Only combine when
            # a second escape really follows and really is a low surrogate;
            # otherwise the first value is emitted on its own and the next
            # escape (if any) is handled by the next loop iteration.
            if (sys.maxunicode > 65535 and 0xd800 <= uni <= 0xdbff
                    and s[end:end + 2] == '\\u'):
                uni2 = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= uni2 <= 0xdfff:
                    uni = 0x10000 + (((uni - 0xd800) << 10) |
                                     (uni2 - 0xdc00))
                    end += 6
            char = unichr(uni)
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
141
142
# The whitespace characters skipped between JSON tokens, as a plain string
# for cheap ``in`` tests and as a compiled pattern for skipping runs.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)

# Use speedup if available
scanstring = c_scanstring or py_scanstring
Brett Cannon4b964f92008-05-05 20:21:38 +0000148
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and build the Python result.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which to start scanning (presumably just past the opening ``{`` --
    the caller is not part of this module).  ``encoding`` and ``strict``
    are passed on to ``scanstring`` for each key.  ``scan_once(s, idx)``
    must return ``(value, end)`` or raise StopIteration when no value
    starts at ``idx``.

    Returns ``(result, end)`` with ``end`` just past the closing ``}``.
    ``result`` is ``object_pairs_hook(pairs)`` when that hook is given
    (``pairs`` being the list of ``(key, value)`` tuples in document
    order); otherwise it is a dict, passed through ``object_hook`` when
    that hook is given.

    Raises ValueError when a key, ``:``, value or ``,`` is missing.
    """
    s, end = s_and_end
    members = []
    add_member = members.append
    # Slicing instead of indexing gives '' at end of input rather than an
    # IndexError; the checks below then raise a more specific ValueError.
    ch = s[end:end + 1]
    # Normally the first key starts right away.
    if ch != '"':
        if ch in _ws:
            end = _w(s, end).end()
            ch = s[end:end + 1]
        # Trivial empty object
        if ch == '}':
            end += 1
            if object_pairs_hook is not None:
                return object_pairs_hook(members), end
            obj = {}
            if object_hook is not None:
                obj = object_hook(obj)
            return obj, end
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead the common separators ":"
        # and ": " are handled without calling the whitespace regex.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            # Ran off the end; scan_once below reports the missing value.
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        add_member((key, value))

        try:
            ch = s[end]
            if ch in _ws:
                end = _w(s, end + 1).end()
                ch = s[end]
        except IndexError:
            ch = ''
        end += 1

        if ch == '}':
            break
        if ch != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))

        # After a comma the next key must follow, possibly after whitespace.
        try:
            ch = s[end]
            if ch in _ws:
                end += 1
                ch = s[end]
                if ch in _ws:
                    end = _w(s, end + 1).end()
                    ch = s[end]
        except IndexError:
            ch = ''

        end += 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes",
                s, end - 1))

    if object_pairs_hook is not None:
        return object_pairs_hook(members), end
    obj = dict(members)
    if object_hook is not None:
        obj = object_hook(obj)
    return obj, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000236
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array into a list.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which to start scanning (presumably just past the opening ``[`` --
    the caller is not part of this module).  ``scan_once(s, idx)`` must
    return ``(value, end)`` or raise StopIteration when no value starts
    at ``idx``.

    Returns ``(values, end)`` with ``end`` just past the closing ``]``.

    Raises ValueError when an element or a ``,`` separator is missing;
    the reported position is that of the offending character.
    """
    s, end = s_and_end
    values = []
    # Slicing gives '' at end of input instead of raising IndexError.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # ``end`` was already advanced past the character just
            # examined, so step back one to point at it (JSONObject
            # reports the same condition the same way).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after the comma; the single-space case avoids
        # a regex call.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            # Ran off the end; scan_once reports the missing element.
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000272
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are translated to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and decoded
    to the corresponding ``float`` values, although they are not part of
    the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure how documents are decoded.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect
        when decoding ``unicode`` objects.

        Note that currently only encodings that are a superset of ASCII
        work; strings in other encodings should be passed in as
        ``unicode``.

        ``object_hook``, if given, is called with the ``dict`` built for
        every decoded JSON object, and its return value is used in place
        of that ``dict``.  This allows custom deserializations (e.g. to
        support JSON-RPC class hinting).

        ``object_pairs_hook``, if given, is called with an ordered list
        of ``(key, value)`` pairs for every decoded JSON object, and its
        return value is used instead of the ``dict``.  This allows
        decoders that depend on the order in which pairs appear (for
        example, collections.OrderedDict remembers insertion order).  If
        ``object_hook`` is also given, ``object_pairs_hook`` takes
        priority.

        ``parse_float``, if given, is called with the string of every
        JSON float to be decoded.  The default is equivalent to
        float(num_str).  This allows another datatype or parser to be
        used for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).
        This allows another datatype or parser to be used for JSON
        integers (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  This can be used to raise an exception
        when invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters here are those
        with character codes in the 0-31 range, including ``'\\t'``
        (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Options supplied by the caller.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook

        # Value parsers, falling back to the built-in behaviour.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__

        # Structural parsers implemented in this module.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring

        # Built last: the scanner is constructed from this instance, so
        # every attribute above must already be in place.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace around the document is ignored; anything else after
        it raises ValueError.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end != len(s):
            raise ValueError(errmsg("Extra data", s, end, len(s)))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document starting at index ``idx`` of ``s`` (a
        ``str`` or ``unicode``) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end