blob: d606cbd8c0f1159f84c26f640fa9c37eddfc42c1 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001"""Implementation of JSONDecoder
2"""
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00003import binascii
Christian Heimes90540002008-05-08 14:29:10 +00004import re
5import sys
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00006import struct
Christian Heimes90540002008-05-08 14:29:10 +00007
Benjamin Petersonc6b607d2009-05-02 12:36:44 +00008from json.scanner import make_scanner
Christian Heimes90540002008-05-08 14:29:10 +00009try:
10 from _json import scanstring as c_scanstring
11except ImportError:
12 c_scanstring = None
13
# Names exported by ``from <this module> import *``.
__all__ = ['JSONDecoder']

# Flags passed to the regular expressions compiled further down this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
17
def _floatconstants():
    """Build the float values behind the non-standard JSON constants.

    Returns a ``(nan, inf, -inf)`` tuple of floats.
    """
    # A quiet NaN followed by +Infinity, each an 8-byte big-endian
    # IEEE 754 double.
    raw = binascii.unhexlify(b'7FF80000000000007FF0000000000000')
    # Decoding with an explicit big-endian format yields the same values on
    # every host, whatever its native byte order.
    not_a_number, infinity = struct.unpack('>dd', raw)
    return not_a_number, infinity, -infinity


NaN, PosInf, NegInf = _floatconstants()
Christian Heimes90540002008-05-08 14:29:10 +000026
27
def linecol(doc, pos):
    """Translate an offset into ``doc`` into a ``(lineno, colno)`` pair.

    ``doc`` is the document being decoded (``str`` or ``bytes``) and ``pos``
    is a 0-based index into it.  Both returned numbers are 1-based: the
    first character of every line, including the first line, is column 1.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = doc.count(newline, 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, which makes the first
    # line count columns from 1 exactly like every following line (the
    # previous code reported 0-based columns on line 1 only).
    colno = pos - doc.rfind(newline, 0, pos)
    return lineno, colno
39
40
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by the location in ``doc`` it refers to.

    ``pos`` is the offset the message is about.  When ``end`` is given, the
    message describes the span from ``pos`` to ``end``; otherwise only the
    single position is reported.  Line and column numbers come from
    linecol().
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is not None:
        endlineno, endcolno = linecol(doc, end)
        template = (
            '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})')
        return template.format(
            msg, lineno, colno, endlineno, endcolno, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, lineno, colno, pos)
Christian Heimes90540002008-05-08 14:29:10 +000054
55
# Non-standard JSON literals mapped to the float value each one decodes to.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
61
62
# Matches a (possibly empty) run of ordinary string characters (group 1)
# followed by the character that ends the run (group 2): a double quote,
# a backslash, or a control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes (the character after the backslash) mapped to
# the character each one stands for.
BACKSLASH = {
    '"': '"', '\\': '\\', '/': '/',
    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}
68
# ASCII hexadecimal digits permitted inside a \uXXXX escape.
_HEXDIGITS = frozenset('0123456789abcdefABCDEF')


def _parse_hex4(s, start):
    """Return the value of the four hex digits at ``s[start:start + 4]``.

    Returns None when fewer than four characters remain or when any of them
    is not an ASCII hexadecimal digit.
    """
    digits = s[start:start + 4]
    # int() on its own is too lenient: with base 16 it also accepts a sign,
    # surrounding whitespace, a "0x" prefix and non-ASCII decimal digits.
    if len(digits) != 4 or not _HEXDIGITS.issuperset(digits):
        return None
    return int(digits, 16)


def py_scanstring(s, end, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    A ``\\uXXXX`` escape must consist of exactly four ASCII hex digits.  On
    builds where ``sys.maxunicode`` exceeds 65535, a high surrogate
    (``\\ud800``-``\\udbff``) must be followed by a ``\\uXXXX`` escape that
    is a low surrogate (``\\udc00``-``\\udfff``); the pair is then combined
    into a single character.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated string" errors.
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            uni = _parse_hex4(s, end + 1)
            if uni is None:
                raise ValueError(errmsg("Invalid \\uXXXX escape", s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s, end + 7)
                # Without the range check a non-surrogate second escape
                # would be folded into a meaningless code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = chr(uni)

            end = next_end
        _append(char)
    return ''.join(chunks), end
140
141
# Use speedup if available
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of the four JSON whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, for cheap ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
Christian Heimes90540002008-05-08 14:29:10 +0000147
148
def _finish_object(pairs, object_hook, object_pairs_hook):
    """Convert the decoded ``(key, value)`` pairs into the final object.

    ``object_pairs_hook`` takes priority and receives the list of pairs;
    otherwise a dict is built and passed through ``object_hook`` if given.
    """
    if object_pairs_hook is not None:
        return object_pairs_hook(pairs)
    result = dict(pairs)
    if object_hook is not None:
        result = object_hook(result)
    return result


def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode the members of a JSON object.

    ``s_and_end`` is an ``(s, end)`` pair: the document and the index at
    which scanning of the object's contents starts (presumably just past
    the opening brace -- confirm against the scanner).  ``strict`` is passed
    on to scanstring() for the keys.  ``scan_once(s, idx)`` must return a
    ``(value, end)`` pair or raise StopIteration when no value starts at
    ``idx``.

    Returns ``(obj, end)`` where ``end`` is the index after the closing
    brace.  ``obj`` is ``object_pairs_hook(pairs)`` when that hook is given,
    otherwise a dict, passed through ``object_hook`` when that is given.
    An empty object goes through the same conversion as any other.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            # Apply the hooks here too; returning the bare (empty) list of
            # pairs would make "{}" decode to a list.
            result = _finish_object(pairs, object_hook, object_pairs_hook)
            return result, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; running off the end of the
        # document is left for scan_once() to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            # End of document: the empty string fails both checks below.
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))
    return _finish_object(pairs, object_hook, object_pairs_hook), end
Christian Heimes90540002008-05-08 14:29:10 +0000216
def JSONArray(s_and_end, scan_once, context=None,
              _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode the elements of a JSON array into a list.

    ``s_and_end`` is an ``(s, end)`` pair: the document and the index at
    which scanning of the array's contents starts (presumably just past the
    opening bracket -- confirm against the scanner).  ``scan_once(s, idx)``
    must return a ``(value, end)`` pair or raise StopIteration when no value
    starts at ``idx``.  ``context`` is not used by this function; it is kept,
    now optional, only so existing call signatures keep working.

    Returns ``(values, end)`` where ``end`` is the index after the closing
    bracket.  Raises ValueError on malformed input.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    # _ws is bound as a default argument; it used to be referenced without
    # being defined anywhere, so every call failed with NameError.
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character, so
            # step back one to report it (as JSONObject does).
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
        # Skip whitespace after the comma; running off the end of the
        # document is left for scan_once() to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Christian Heimes90540002008-05-08 14:29:10 +0000252
253
class JSONDecoder(object):
    """Decoder that turns JSON <http://json.org> text into Python objects.

    Unless customised, JSON values are converted as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    Although they are not part of the JSON spec, the literals ``NaN``,
    ``Infinity`` and ``-Infinity`` are also accepted and decode to the
    corresponding ``float`` values.

    """

    def __init__(self, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """Configure how JSON values are converted.

        ``object_hook``, when given, is called with the ``dict`` built for
        each decoded JSON object, and whatever it returns is used in place
        of that ``dict`` (useful for custom deserialization such as
        JSON-RPC class hinting).

        ``object_pairs_hook``, when given, is called with an ordered list
        of ``(key, value)`` pairs for each decoded JSON object, and its
        return value replaces the ``dict``.  Use it when the decoding order
        of the pairs matters (for example with collections.OrderedDict).
        It takes priority over ``object_hook`` when both are supplied.

        ``parse_float``, when given, is called with the string of every
        JSON float to decode; the default behaves like ``float(num_str)``.
        It allows another type or parser to be used (e.g.
        decimal.Decimal).

        ``parse_int``, when given, is called with the string of every JSON
        int to decode; the default behaves like ``int(num_str)``.  It
        allows another type or parser to be used (e.g. float).

        ``parse_constant``, when given, is called with one of the strings
        -Infinity, Infinity or NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        ``strict`` defaults to true.  When it is false, control characters
        are allowed inside strings; in this context those are the
        characters with codes in the 0-31 range, including ``'\\t'`` (tab),
        ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Any parser that was not supplied falls back to the built-in one.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # make_scanner() is handed the fully configured decoder, so it is
        # called only after every attribute above has been set.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` instance holding one JSON document) and
        return the resulting Python object.

        Whitespace before and after the document is skipped; anything else
        after the document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        total = len(s)
        if stop != total:
            raise ValueError(errmsg("Extra data", s, stop, total))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str``) and return a 2-tuple of the Python representation and
        the index in ``s`` where the document ended.

        Unlike decode(), trailing data after the document is not an error,
        so this can be used on a string with extraneous data at the end.
        Raises ValueError when no document can be decoded at ``idx``.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end