blob: 6596154d873bd9be2dabc458edc18c3d32715fc3 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00008from json.scanner import make_scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
14__all__ = ['JSONDecoder']
15
# Regex flags shared by the patterns compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
17
def _floatconstants():
    """Return the ``(nan, inf, -inf)`` float triple.

    The values are obtained from ``float()`` directly instead of being
    unpacked from hard-coded IEEE 754 byte patterns, so no byte-order
    handling is needed.
    """
    nan = float('nan')
    inf = float('inf')
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of index ``pos`` in ``doc``.

    ``doc`` may be a ``str`` or a ``bytes`` object; lines are separated by
    a newline of the matching type.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = doc.count(newline, 0, pos) + 1
    if lineno == 1:
        # No newline precedes pos, so the column is the offset from the
        # start of the document; +1 keeps it 1-based like later lines.
        colno = pos + 1
    else:
        # Distance from the last newline before pos (already 1-based).
        colno = pos - doc.rindex(newline, 0, pos)
    return lineno, colno
39
40
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` with the location of ``pos`` (and ``end``) in ``doc``.

    With ``end`` omitted the result names a single position:
    ``"<msg>: line L column C (char P)"``.  With ``end`` given it names
    the range from ``pos`` to ``end``.  Line and column numbers come from
    ``linecol``.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is None:
        fmt = '{0}: line {1} column {2} (char {3})'
        return fmt.format(msg, lineno, colno, pos)
    endlineno, endcolno = linecol(doc, end)
    fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
    return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Maps the non-standard JSON constant names to their float values; used as
# the default ``parse_constant`` lookup by JSONDecoder.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
61
62
# Matches a run of ordinary characters followed by the character that ends
# the run: a quote, a backslash, or a literal control character.
# The flags are spelled out (same value as the module-level FLAGS constant)
# so the pattern does not depend on definition order.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])',
                         re.VERBOSE | re.MULTILINE | re.DOTALL)
# Single-character escapes and the characters they stand for.
BACKSLASH = {
    '"': '"', '\\': '\\', '/': '/',
    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}

_HEXDIGITS = '0123456789abcdefABCDEF'


def _hex4(s, start):
    """Return the value of the four hex digits at ``s[start:start + 4]``.

    Returns ``None`` when fewer than four characters are available or any
    of them is not an ASCII hex digit.  The explicit digit check is needed
    because ``int(text, 16)`` alone also accepts forms such as ``'0x41'``,
    a leading sign, or surrounding whitespace.
    """
    digits = s[start:start + 4]
    if len(digits) == 4 and all(c in _HEXDIGITS for c in digits):
        return int(digits, 16)
    return None


def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            uni = _hex4(s, end + 1)
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                uni2 = None
                if s[end + 5:end + 7] == '\\u':
                    uni2 = _hex4(s, end + 7)
                # The second escape must be a low surrogate; combining
                # anything else would yield an unrelated code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = chr(uni)
            end = next_end
        _append(char)
    return ''.join(chunks), end
140
141
# Use speedup if available: prefer the C implementation imported from
# _json and fall back to the pure-Python scanner when it is None.
scanstring = c_scanstring or py_scanstring
Christian Heimes90540002008-05-08 14:29:10 +0000144
# Matches a (possibly empty) run of JSON whitespace.  The flags are spelled
# out (same value as the module-level FLAGS constant) so the pattern does
# not depend on definition order.
WHITESPACE = re.compile(r'[ \t\n\r]*',
                        re.VERBOSE | re.MULTILINE | re.DOTALL)
WHITESPACE_STR = ' \t\n\r'


def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object whose opening ``{`` has already been consumed.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    after the ``{``.  ``scan_once(s, idx)`` must return ``(value, new_idx)``
    or raise ``StopIteration`` when no value starts at ``idx``.

    Returns ``(result, end)`` where ``end`` is the index just after the
    closing ``}``.  ``result`` is ``object_pairs_hook(pairs)`` when that
    hook is given (``pairs`` being the list of ``(key, value)`` tuples in
    document order); otherwise it is a dict, passed through ``object_hook``
    when that hook is given.  The same rules apply to an empty object.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    # setdefault returns the first-seen instance of an equal key, so
    # repeated keys share a single string object.
    memo_get = memo.setdefault
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object: apply the hooks exactly as for a
        # non-empty object so callers always get the same result type.
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; running off the end of the
        # string is reported by scan_once below.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Christian Heimes90540002008-05-08 14:29:10 +0000221
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array whose opening ``[`` has already been consumed.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    after the ``[``.  ``scan_once(s, idx)`` must return ``(value, new_idx)``
    or raise ``StopIteration`` when no value starts at ``idx``.

    Returns ``(values, end)`` where ``values`` is the list of decoded
    elements and ``end`` is the index just after the closing ``]``.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past nextchar; report the position
            # of the offending character itself.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        # Skip whitespace after the comma; running off the end of the
        # string is reported by scan_once on the next iteration.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000257
258
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    """

    def __init__(self, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """``object_hook``, if specified, will be called with the result
        of every JSON object decoded and its return value will be used in
        place of the given ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, will be called with the
        list of ``(key, value)`` pairs of every JSON object decoded, in
        document order, and its return value will be used in place of the
        ``dict``.  In this module's ``JSONObject`` parser it takes
        priority over ``object_hook`` when both are given.

        ``parse_float``, if specified, will be called with the string
        of every JSON float to be decoded. By default this is equivalent to
        float(num_str). This can be used to use another datatype or parser
        for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, will be called with the string
        of every JSON int to be decoded. By default this is equivalent to
        int(num_str). This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, will be called with one of the
        following strings: -Infinity, Infinity, NaN.
        This can be used to raise an exception if invalid JSON numbers
        are encountered.

        ``strict`` is handed to the string scanner; when it is false,
        literal control characters are allowed inside JSON strings.

        """
        self.object_hook = object_hook
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.strict = strict
        self.object_pairs_hook = object_pairs_hook
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # Built last: make_scanner receives this instance, so the
        # attributes above are set before it is called.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Leading and trailing whitespace is skipped.  Raises ValueError if
        anything other than whitespace follows the document.

        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise ValueError(errmsg("Extra data", s, end, len(s)))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Scanning starts at index ``idx``.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises ValueError if no JSON value starts at ``idx``.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        return obj, end
350 return obj, end