blob: 80d3420a95c8092572806580841ebf57ebda5e88 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Ezio Melotti6b60fb92011-05-14 06:47:51 +03008from json import scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
# Names exported by ``from <this module> import *``.
__all__ = ['JSONDecoder']

# Flag set passed to the re.compile() calls in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
17
def _floatconstants():
    """Return the special float values as ``(nan, inf, -inf)``."""
    # float() understands these spellings directly, so there is no need to
    # assemble the doubles by hand from byte patterns and the host byte order.
    inf = float('inf')
    return float('nan'), inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Return the 1-based ``(line, column)`` of offset ``pos`` in ``doc``.

    ``doc`` may be ``str`` or ``bytes``; the newline marker is chosen to
    match its type.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = 1 + doc.count(newline, 0, pos)
    if lineno > 1:
        # Column is the distance from the last newline before ``pos``.
        return lineno, pos - doc.rindex(newline, 0, pos)
    # No newline precedes ``pos``: the column is the offset plus one.
    return lineno, pos + 1
39
40
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by the location of ``pos`` within ``doc``.

    With only ``pos`` the result names a single line, column and character
    offset.  When ``end`` is given as well, the result describes the span
    from ``pos`` to ``end``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - '
                    'line {3} column {4} (char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    return '{0}: line {1} column {2} (char {3})'.format(
        msg, start_line, start_col, pos)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Spellings of the non-finite float literals mapped to the float value
# each one stands for.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
61
62
# Group 1 is a (possibly empty) run of ordinary characters; group 2 is the
# single character that ends the run: a double quote, a backslash or a
# control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes: the character following a backslash mapped to
# the text it decodes to.
BACKSLASH = {
    '"': '"',
    '\\': '\\',
    '/': '/',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
}
68
def _decode_uXXXX(s, pos):
    """Decode the four hex digits of a ``\\uXXXX`` escape to an ``int``.

    The digits are the four characters of ``s`` that follow index ``pos``
    (i.e. ``s[pos + 1:pos + 5]``).

    Raises ValueError if fewer than four characters remain or if any of
    them is not an ASCII hexadecimal digit.
    """
    esc = s[pos + 1:pos + 5]
    # int() on its own is too lenient here: it also accepts a sign,
    # surrounding whitespace, a '0x' prefix, underscores and non-ASCII
    # digits, none of which may appear in a JSON escape.  Check every
    # character explicitly before converting.
    if len(esc) == 4 and all(c in '0123456789abcdefABCDEF' for c in esc):
        return int(esc, 16)
    msg = "Invalid \\uXXXX escape"
    raise ValueError(errmsg(msg, s, pos))
78
def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string in ``s`` whose contents begin at ``end``.

    ``end`` is the index of the character right after the opening quote.
    Every valid escape sequence is replaced by the character it denotes.
    When ``strict`` is true (the default) a literal control character inside
    the string is an error; when it is false such characters are kept.

    Returns a tuple of the decoded string and the index in ``s`` just after
    the closing quote.  Raises ValueError for an unterminated string, an
    invalid escape or (in strict mode) a literal control character.
    """
    pieces = []
    add = pieces.append
    # Index of the opening quote, reported in "unterminated" errors.
    start = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, start))
        end = match.end()
        text, stopper = match.groups()
        # ``text`` holds zero or more unescaped string characters
        if text:
            add(text)
        # ``stopper`` is the closing quote, a literal control character,
        # or a backslash announcing an escape sequence
        if stopper == '"':
            return ''.join(pieces), end
        if stopper != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(stopper)
                raise ValueError(errmsg(msg, s, end))
            add(stopper)
            continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, start))
        if esc == 'u':
            code = _decode_uXXXX(s, end)
            end += 5
            # A high surrogate immediately followed by another \u escape
            # may be the first half of a surrogate pair; combine the two
            # only when the second value really is a low surrogate.
            if 0xd800 <= code <= 0xdbff and s[end:end + 2] == '\\u':
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (
                        ((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            char = chr(code)
        else:
            # Anything other than a unicode escape must be in the lookup
            # table of single-character escapes
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        add(char)
138
139
# Prefer the C implementation when it was importable, otherwise fall back
# to the pure-Python one
scanstring = c_scanstring if c_scanstring else py_scanstring

# The whitespace characters skipped between tokens: space, tab, line feed
# and carriage return -- once as a plain string for membership tests and
# once as a pattern matching a whole run of them.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
Christian Heimes90540002008-05-08 14:29:10 +0000145
146
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; scanning starts at index ``end``
    of ``s``, where whitespace, a property name or the closing ``}`` is
    expected (the opening brace is never looked for here, so the caller is
    expected to have consumed it).  Property names are read with
    ``scanstring`` (honouring ``strict``) and values with ``scan_once``.

    Keys are passed through ``memo.setdefault`` so that equal key strings
    share a single object; a fresh dict is used when ``memo`` is None.

    The result is ``object_pairs_hook(pairs)`` when that hook is given,
    where ``pairs`` is the list of ``(key, value)`` tuples in document
    order.  Otherwise it is a dict, passed through ``object_hook`` when that
    hook is given.  The returned ``end`` is the index just past the closing
    ``}``.  Raises ValueError on malformed input.
    """
    s, end = s_and_end
    items = []
    add_item = items.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    share_key = memo.setdefault
    # Slicing cannot raise IndexError; if the string has run out, the
    # checks below raise a more specific ValueError instead
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                return object_pairs_hook(items), end + 1
            empty = {}
            if object_hook is not None:
                empty = object_hook(empty)
            return empty, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = share_key(key, key)
        # To skip some function call overhead we optimize the fast paths
        # where the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace before the value; only fall back to the regex
        # when more than one whitespace character is present
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        add_item((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # ``end`` was already advanced past the offending character
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes",
                s, end - 1))
    if object_pairs_hook is not None:
        return object_pairs_hook(items), end
    obj = dict(items)
    if object_hook is not None:
        obj = object_hook(obj)
    return obj, end
Christian Heimes90540002008-05-08 14:29:10 +0000227
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair; scanning starts at index ``end``
    of ``s``, where whitespace, the first element or the closing ``]`` is
    expected (the opening bracket is never looked for here, so the caller
    is expected to have consumed it).  Each element is read with
    ``scan_once``.

    Returns the list of decoded elements and the index just past the
    closing ``]``.  Raises ValueError when an element cannot be decoded or
    when an element is not followed by ``,`` or ``]``.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # ``end`` has already been advanced past the offending
            # character, so step back one to report where the delimiter
            # was actually expected.
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace before the next element; only fall back to the
        # regex when more than one whitespace character is present
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000263
264
class JSONDecoder:
    """Simple JSON <http://json.org> decoder

    By default JSON values are decoded to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also understood and decoded
    to the corresponding ``float`` values, although they are not part of
    the JSON spec.

    """

    def __init__(self, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """Configure how JSON values are turned into Python objects.

        ``object_hook``, if given, is called with the ``dict`` produced for
        every decoded JSON object, and its return value is used in place of
        that ``dict``.  This allows custom deserializations (e.g. to
        support JSON-RPC class hinting).

        ``object_pairs_hook``, if given, is called for every decoded JSON
        object with an ordered list of ``(key, value)`` pairs, and its
        return value is used instead of the ``dict``.  This allows custom
        decoders that rely on the order in which the pairs are decoded (for
        example, collections.OrderedDict will remember the order of
        insertion).  When ``object_hook`` is given as well,
        ``object_pairs_hook`` takes priority.

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded.  The default is equivalent to float(num_str).
        Another datatype or parser can be used for JSON floats this way
        (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).
        Another datatype or parser can be used for JSON integers this way
        (e.g. float).

        ``parse_constant``, if given, is called with one of the following
        strings: -Infinity, Infinity, NaN.  It can be used to raise an
        exception when invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters in this context are
        those with character codes in the 0-31 range, including ``'\\t'``
        (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Caller-supplied hooks and options
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.strict = strict
        # Converters, falling back to the defaults when none is supplied
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (
            parse_constant if parse_constant else _CONSTANTS.__getitem__)
        # Parsers for the composite and string values
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # Built last: the scanner is created from this configured instance
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Whitespace before and after the document is ignored; anything else
        following the document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str``) and return a 2-tuple of its Python representation and
        the index in ``s`` where the document ended.

        Unlike ``decode``, extraneous data after the document is not an
        error, so this can be used on a string that has more content
        following the JSON document.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end