blob: 07fd696400b3f05b929b557273f5ff5b7e6142e8 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Ezio Melotti6b60fb92011-05-14 06:47:51 +03008from json import scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
14__all__ = ['JSONDecoder']
15
# Regex flags applied to every pattern compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
17
def _floatconstants():
    """Return the tuple ``(nan, inf, -inf)`` of special float values.

    The three values are bound to the module-level names ``NaN``,
    ``PosInf`` and ``NegInf`` immediately below.
    """
    # float() understands these spellings on every platform, so there is no
    # need to assemble the IEEE 754 bit patterns by hand and correct them
    # for the host byte order.
    inf = float('inf')
    return float('nan'), inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of offset ``pos`` in ``doc``.

    ``doc`` is the document being decoded, either ``str`` or ``bytes``;
    ``pos`` is a 0-based offset into it.  Lines are separated by ``\\n``.
    Both the line and the column are counted from 1, so offset 0 is
    reported as ``(1, 1)``.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = doc.count(newline, 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, which makes the column
    # 1-based on the first line exactly as it is on every later line.
    colno = pos - doc.rfind(newline, 0, pos)
    return lineno, colno
39
40
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` annotated with where in ``doc`` the problem lies.

    With only ``pos`` the result names a single location (line, column and
    character offset).  When ``end`` is also given the result names the
    span from ``pos`` to ``end``.  Line and column numbers come from
    ``linecol``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Lookup table from the non-standard JSON literals to their float values.
_CONSTANTS = {'-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN}
61
62
# Group 1 is a (possibly empty) run of ordinary characters; group 2 is the
# single character that ends the run: a double quote, a backslash, or a
# control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', flags=FLAGS)

# Single-character escapes: the character that follows the backslash,
# mapped to the text it stands for.
BACKSLASH = {
    '"': '"',
    '\\': '\\',
    '/': '/',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
}
68
# The only characters allowed in the XXXX part of a \uXXXX escape.
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _hex4(s, start):
    """Return the value of the four hex digits at ``s[start:start + 4]``.

    Returns ``None`` when fewer than four characters are available or when
    any of them is not a hexadecimal digit.
    """
    digits = s[start:start + 4]
    # int() alone is too lenient here: it would also accept a sign,
    # surrounding whitespace, underscores or a '0x' prefix.
    if len(digits) == 4 and _HEX_DIGITS.issuperset(digits):
        return int(digits, 16)
    return None


def py_scanstring(s, end, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    An escape in the high-surrogate range (D800-DBFF) must be followed
    immediately by a ``\\uXXXX`` escape in the low-surrogate range
    (DC00-DFFF); the two are combined into one character.  Anything else
    raises ValueError.

    ``_b`` and ``_m`` are the escape table and chunk matcher, bound as
    defaults; callers are not expected to pass them.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting an unterminated string
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # s[end] is the 'u'; the four hex digits follow it
            uni = _hex4(s, end + 1)
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            if 0xd800 <= uni <= 0xdbff:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _hex4(s, end + 7)
                # Only a low surrogate can complete the pair; combining any
                # other value would silently yield an unrelated character.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = chr(uni)

            end = next_end
        _append(char)
    return ''.join(chunks), end
139
140
# Prefer the C implementation when the _json import supplied one,
# otherwise fall back to the pure-Python scanner.
scanstring = c_scanstring if c_scanstring else py_scanstring

# JSON whitespace, as a plain string for single-character membership tests
# and as a compiled pattern for skipping a whole run at once.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', flags=FLAGS)
Christian Heimes90540002008-05-08 14:29:10 +0000146
147
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which the first property name or the closing ``}`` is expected (the
    opening brace is not checked here, so the caller must already have
    consumed it).

    ``strict`` is passed through to ``scanstring`` for every key.
    ``scan_once(s, idx)`` must return ``(value, end)`` and raise
    StopIteration when no value starts at ``idx``.

    If ``object_pairs_hook`` is not None it is called with the list of
    ``(key, value)`` tuples in document order and its return value is the
    result.  Otherwise the pairs are turned into a ``dict``, which is passed
    through ``object_hook`` when that is not None.

    ``memo`` is a dict used to reuse one string object for equal keys; a
    fresh one is created when it is omitted.  ``_w`` and ``_ws`` are the
    whitespace matcher and whitespace characters, bound as defaults.

    The returned ``end`` is the index just past the closing ``}``.  Raises
    ValueError on malformed input.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            # Every return here must step past the closing brace, otherwise
            # the caller sees the '}' again as trailing data.
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace before the value; running off the end of the
        # document is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced, so the offending character is at
            # end - 1
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end - 1))
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Christian Heimes90540002008-05-08 14:29:10 +0000228
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which the first element or the closing ``]`` is expected (the opening
    bracket is not checked here, so the caller must already have consumed
    it).  ``scan_once(s, idx)`` must return ``(value, end)`` and raise
    StopIteration when no value starts at ``idx``.  ``_w`` and ``_ws`` are
    the whitespace matcher and whitespace characters, bound as defaults.

    ``values`` is a list of the decoded elements and ``end`` is the index
    just past the closing ``]``.  Raises ValueError on malformed input.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character, so
            # report end - 1 (the same convention JSONObject uses).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace before the next element; running off the end of
        # the document is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000264
265
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are decoded to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    Although they are not part of the JSON spec, ``NaN``, ``Infinity`` and
    ``-Infinity`` are also accepted and decoded to the corresponding
    ``float`` values.

    """

    def __init__(self, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """Configure how each kind of JSON value is decoded.

        ``object_hook``, if given, is called with the ``dict`` built for
        every decoded JSON object, and whatever it returns is used in place
        of that ``dict``.  This allows custom deserializations (e.g. to
        support JSON-RPC class hinting).

        ``object_pairs_hook``, if given, is called for every decoded JSON
        object with an ordered list of ``(key, value)`` pairs, and whatever
        it returns is used in place of the ``dict``.  This suits decoders
        that depend on the order in which pairs appear (for example,
        collections.OrderedDict remembers insertion order).  When
        ``object_hook`` is given as well, ``object_pairs_hook`` takes
        priority.

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded.  The default is equivalent to float(num_str).
        Use it to substitute another datatype or parser for JSON floats
        (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).
        Use it to substitute another datatype or parser for JSON integers
        (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity or NaN.  Use it to raise an exception when
        invalid JSON numbers are encountered.

        ``strict`` is true by default.  When it is false, control
        characters are allowed inside strings; here that means characters
        with codes in the 0-31 range, including ``'\\t'`` (tab), ``'\\n'``,
        ``'\\r'`` and ``'\\0'``.

        """
        # Caller-supplied hooks and options
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        self.strict = strict
        # Value parsers, falling back to the built-in behaviour
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (
            parse_constant if parse_constant else _CONSTANTS.__getitem__)
        # Structural parsers defined in this module
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # Built last: make_scanner is handed the fully configured decoder
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Whitespace before and after the document is skipped; anything else
        left after the document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str``) and return a 2-tuple of the Python representation and
        the index in ``s`` where the document ended.

        Unlike ``decode``, trailing data after the document is not an
        error, so this can be used on a string with extraneous data at the
        end.

        """
        try:
            obj, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, stop
371 return obj, end