blob: e7c0539b86a555d11179812ba81c167efb2a0a05 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Ezio Melotti6b60fb92011-05-14 06:47:51 +03008from json import scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
14__all__ = ['JSONDecoder']
15
# Regex flags shared by every pattern compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
17
Benjamin Petersonc6b607d2009-05-02 12:36:44 +000018def _floatconstants():
19 _BYTES = binascii.unhexlify(b'7FF80000000000007FF0000000000000')
20 if sys.byteorder != 'big':
21 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
22 nan, inf = struct.unpack('dd', _BYTES)
23 return nan, inf, -inf
24
25NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of offset ``pos`` in ``doc``.

    ``doc`` may be ``str`` or ``bytes``; lines are delimited by ``\\n``.

    Both numbers are 1-based on every line.  ``rfind`` yields -1 when no
    newline precedes ``pos``, which makes the first line come out 1-based
    exactly like all following lines.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = doc.count(newline, 0, pos) + 1
    colno = pos - doc.rfind(newline, 0, pos)
    return lineno, colno
39
40
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with its location inside ``doc``.

    ``pos`` is the offset the message refers to.  When ``end`` is given,
    the message describes the span from ``pos`` to ``end`` instead of a
    single position.  Line and column numbers come from ``linecol``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = (
            '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Spellings of the non-finite float constants mapped to their float values.
_CONSTANTS = {'-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN}
61
62
# Group 1: a (possibly empty, non-greedy) run of characters.
# Group 2: the first double quote, backslash, or character in \x00-\x1f
# that follows that run.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Character following a backslash -> character it stands for.
BACKSLASH = {
    '"': '"',
    '\\': '\\',
    '/': '/',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
}
68
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _hex4(esc):
    """Return the value of ``esc`` if it is exactly four hex digits, else None.

    ``int(esc, 16)`` alone is too permissive: it also accepts a ``0x``
    prefix, a sign, surrounding whitespace and non-ASCII digits.
    """
    if len(esc) == 4 and _HEX_DIGITS.issuperset(esc):
        return int(esc, 16)
    return None


def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    A high surrogate (``\\ud800``-``\\udbff``) must be followed immediately
    by a ``\\uXXXX`` escape holding a low surrogate
    (``\\udc00``-``\\udfff``); anything else raises ValueError.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting unterminated strings.
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # s[end] is the 'u'; the four hex digits follow it.
            uni = _hex4(s[end + 1:end + 5])
            next_end = end + 5
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            if 0xd800 <= uni <= 0xdbff:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _hex4(s[end + 7:end + 11])
                # The second escape must really be a low surrogate,
                # otherwise the arithmetic below yields an unrelated
                # code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = chr(uni)

            end = next_end
        _append(char)
    return ''.join(chunks), end
139
140
# Use the C implementation when it was importable, else the Python one.
scanstring = c_scanstring if c_scanstring else py_scanstring

# Pattern form and plain-string form of the same four whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
Christian Heimes90540002008-05-08 14:29:10 +0000146
147
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which scanning starts.  At that index the code accepts optional
    whitespace followed by either ``}`` or the opening quote of the first
    property name.

    ``strict`` is passed through to ``scanstring`` for the keys and
    ``scan_once(s, idx)`` is used to decode each value.  ``memo`` is a dict
    used to share equal key strings; a fresh one is created when omitted.

    If ``object_pairs_hook`` is not None the result is
    ``object_pairs_hook(list_of_pairs)``.  Otherwise the pairs are turned
    into a ``dict`` which is passed through ``object_hook`` when that is
    not None.

    In every case the returned ``end`` is the index just past the closing
    ``}``.  Raises ValueError on a missing property name, ``:`` or ``,``
    delimiter, or value.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                # Step past the '}' exactly as the dict path below does.
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))
        end += 1

        # Skip whitespace before the value.  Running off the end of s is
        # deliberately ignored here; scan_once reports the missing value.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            # End of input: leave nextchar empty so the delimiter check
            # below raises.
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Christian Heimes90540002008-05-08 14:29:10 +0000226
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which scanning starts.  At that index the code accepts optional
    whitespace followed by either ``]`` or the first element.
    ``scan_once(s, idx)`` is used to decode each element.

    ``values`` is a list and the returned ``end`` is the index just past
    the closing ``]``.  Raises ValueError when an element or a ``,``
    delimiter is missing.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character, so
            # step back one to report where the delimiter was expected.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        # Skip whitespace before the next element.  Running off the end of
        # s is deliberately ignored here; scan_once reports the problem.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000262
263
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Unless customised, decoding maps JSON values to Python values as
    follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are accepted as well and decode
    to the matching ``float`` values, although they are not part of the
    JSON spec.

    """

    def __init__(self, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure how JSON values are converted.

        ``object_hook``, when given, is called with the ``dict`` produced
        for every decoded JSON object, and its return value replaces that
        ``dict``.  Useful for custom deserialization (e.g. JSON-RPC class
        hinting).

        ``object_pairs_hook``, when given, is called for every decoded
        JSON object with an ordered list of ``(key, value)`` pairs, and its
        return value is used instead of the ``dict``.  This allows decoders
        that depend on the order in which pairs were decoded (for example
        collections.OrderedDict, which remembers insertion order).  When
        ``object_hook`` is given too, ``object_pairs_hook`` takes priority.

        ``parse_float``, when given, is called with the string of every
        JSON float to decode.  The default is equivalent to
        float(num_str).  Another datatype or parser may be substituted
        (e.g. decimal.Decimal).

        ``parse_int``, when given, is called with the string of every JSON
        int to decode.  The default is equivalent to int(num_str).  Another
        datatype or parser may be substituted (e.g. float).

        ``parse_constant``, when given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters here are those with
        character codes in the 0-31 range, including ``'\\t'`` (tab),
        ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Options stored as supplied by the caller.
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Converters: use the built-in default when none (or a falsy
        # value) was supplied.
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (
            parse_constant if parse_constant else _CONSTANTS.__getitem__)
        # Parsers for the structured value types.
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        self.memo = {}
        # Created last because make_scanner is handed this instance.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Whitespace before and after the document is skipped.  Raises
        ValueError if anything else follows the document.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document), starting at index ``idx``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended, so trailing extraneous data in
        ``s`` is tolerated.  Raises ValueError when no document can be
        decoded.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end