blob: 938ebffb11a09417e1e06f802f2ea320a907f942 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Ezio Melotti6b60fb92011-05-14 06:47:51 +03008from json import scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
# Only JSONDecoder is exported by ``from ... import *``.
__all__ = ['JSONDecoder']

# Regex flags shared by the patterns compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
17
def _floatconstants():
    """Return the special float values as the tuple ``(nan, inf, -inf)``.

    The values are obtained from the ``float`` constructor, which accepts
    the strings ``'nan'`` and ``'inf'``, instead of being assembled by hand
    from raw byte patterns that depend on the platform byte order.
    """
    nan = float('nan')
    inf = float('inf')
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Translate the offset ``pos`` in ``doc`` into ``(line, column)``.

    ``doc`` may be ``str`` or ``bytes``.  The line number is 1-based.  On
    the first line the column is ``pos + 1``; on later lines it is the
    distance from the closest newline that precedes ``pos``.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    newlines_before = doc.count(newline, 0, pos)
    if newlines_before:
        # Measure the column from the last newline found before ``pos``.
        return newlines_before + 1, pos - doc.rindex(newline, 0, pos)
    return 1, pos + 1
39
40
def errmsg(msg, doc, pos, end=None):
    """Build an error message that pinpoints a location in ``doc``.

    ``msg`` is prefixed to the line, column and character offset of ``pos``.
    When ``end`` is given, the message describes the range from ``pos`` to
    ``end`` instead of a single position.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(msg, start_line, start_col,
                               stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Maps the non-standard JSON constant names to the float values defined
# above; JSONDecoder uses this lookup when no ``parse_constant`` is given.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
61
62
# Group 1 lazily matches a run of characters; group 2 is the character that
# stops the run: a double quote, a backslash or a control character in the
# range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Maps the character following a backslash to the character it stands for.
# ``\u`` escapes are not in this table; py_scanstring handles them itself.
BACKSLASH = {
    '"': '"', '\\': '\\', '/': '/',
    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}
68
_HEXDIGITS = frozenset('0123456789abcdefABCDEF')


def _parse_hex4(digits):
    """Return the value of exactly four ASCII hex digits, or None.

    ``int(digits, 16)`` alone is too lenient here: it also accepts a sign,
    surrounding whitespace, a ``0x`` prefix and non-ASCII digits, none of
    which are valid inside a JSON ``\\uXXXX`` escape.
    """
    if len(digits) == 4 and all(c in _HEXDIGITS for c in digits):
        return int(digits, 16)
    return None


def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    A high surrogate (0xd800-0xdbff) must be followed by a second
    ``\\uXXXX`` escape holding a low surrogate (0xdc00-0xdfff); any other
    continuation raises ValueError instead of producing a wrong character.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # ``end`` is the index of the 'u'; the digits follow it.
            uni = _parse_hex4(s[end + 1:end + 5])
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s[end + 7:end + 11])
                # The second half must be a well-formed low surrogate;
                # otherwise the arithmetic below yields a bogus code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = chr(uni)

            end = next_end
        _append(char)
    return ''.join(chunks), end
140
141
# Use speedup if available: c_scanstring is None when the _json import
# failed, in which case the pure-Python py_scanstring is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of space, tab, newline and carriage return.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, for cheap ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
Christian Heimes90540002008-05-08 14:29:10 +0000147
148
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object and return ``(result, end)``.

    ``s_and_end`` is a pair ``(s, end)`` where ``end`` is the index in ``s``
    at which scanning of the object body starts.  Keys are read with
    ``scanstring`` (honouring ``strict``) and values with ``scan_once``.
    Keys are passed through ``memo.setdefault`` so that equal keys share
    one object.

    If ``object_pairs_hook`` is given, the result is its return value for
    the list of ``(key, value)`` pairs; otherwise the pairs are turned into
    a ``dict`` which is passed through ``object_hook`` when that is given.
    ``end`` in the returned tuple is the index just after the closing
    brace.  Malformed input raises ValueError.
    """
    s, end = s_and_end
    items = []
    add_item = items.append
    # ``memo`` is optional for backwards compatibility with older callers.
    intern_key = ({} if memo is None else memo).setdefault
    # Slicing instead of indexing avoids an IndexError at the end of the
    # input; the checks below then raise a more specific ValueError.
    ch = s[end:end + 1]
    # The common case is that a key starts right here.
    if ch != '"':
        if ch in _ws:
            end = _w(s, end).end()
            ch = s[end:end + 1]
        if ch == '}':
            # Empty object: nothing to scan.
            if object_pairs_hook is not None:
                return object_pairs_hook(items), end + 1
            empty = {}
            if object_hook is not None:
                empty = object_hook(empty)
            return empty, end + 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = intern_key(key, key)
        # Fast path: the separator follows the key directly.  Only run the
        # whitespace regex when it does not.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; a single blank is handled
        # without calling the regex.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        add_item((key, value))

        # Fetch the character that follows the value, skipping whitespace.
        try:
            ch = s[end]
            if ch in _ws:
                end = _w(s, end + 1).end()
                ch = s[end]
        except IndexError:
            ch = ''

        if ch == '}':
            end += 1
            break
        if ch != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end))
        # Step over the comma and any whitespace; another key must follow.
        end = _w(s, end + 1).end()
        if s[end:end + 1] != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
        end += 1
    if object_pairs_hook is not None:
        return object_pairs_hook(items), end
    mapping = dict(items)
    if object_hook is not None:
        mapping = object_hook(mapping)
    return mapping, end
Christian Heimes90540002008-05-08 14:29:10 +0000229
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array and return ``(values, end)``.

    ``s_and_end`` is a pair ``(s, end)`` where ``end`` is the index in ``s``
    at which scanning of the array body starts.  Each element is read with
    ``scan_once``.  The returned ``end`` is the index just after the closing
    bracket.

    Raises ValueError when an element is missing or when an element is not
    followed by ``,`` or ``]``.  The position in the latter message is that
    of the offending character itself.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # ``end`` was already advanced past the character just examined,
            # so step back one to report where the delimiter was expected.
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after the comma; a single blank is handled
        # without calling the regex.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000265
266
class JSONDecoder(object):
    """Decoder that turns JSON <http://json.org> text into Python objects.

    Unless customised, JSON values are converted as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    As an extension to the JSON spec, ``NaN``, ``Infinity`` and
    ``-Infinity`` are accepted and decoded to the matching ``float``
    values.

    """

    def __init__(self, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure how JSON values are converted.

        ``object_hook``, when given, is called with the ``dict`` built for
        each decoded JSON object; its return value replaces that ``dict``.
        This allows custom deserialisation (e.g. JSON-RPC class hinting).

        ``object_pairs_hook``, when given, is called with the ordered list
        of ``(key, value)`` pairs of each decoded JSON object; its return
        value is used instead of a ``dict``.  Use it for decoders that
        depend on the order of the pairs (e.g. collections.OrderedDict).
        It takes priority over ``object_hook`` when both are given.

        ``parse_float``, when given, is called with the string of each
        JSON float.  The default is ``float``; another type or parser
        (e.g. decimal.Decimal) may be supplied.

        ``parse_int``, when given, is called with the string of each JSON
        integer.  The default is ``int``; another type or parser (e.g.
        float) may be supplied.

        ``parse_constant``, when given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        ``strict`` defaults to true.  When it is false, control characters
        (character codes 0-31, including ``'\\t'``, ``'\\n'``, ``'\\r'``
        and ``'\\0'``) are allowed inside strings.

        """
        # Conversion callables, falling back to the built-in behaviour.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        # Parsers for the composite and string JSON types.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # User-supplied hooks and options.
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.strict = strict
        self.memo = {}
        # Must come last: the scanner is built from this instance, after
        # all the attributes above have been assigned.
        self.scan_once = scanner.make_scanner(self)


    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` instance holding one JSON document) and
        return the resulting Python object.

        Whitespace before and after the document is skipped; anything else
        after the document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str``) and return a 2-tuple of the Python object and the
        index in ``s`` at which the document ended.

        Unlike ``decode``, trailing data after the document is not an
        error, so this can be used on strings with extraneous data at the
        end.

        """
        try:
            obj, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, stop
372 return obj, end