blob: 5141f879d956e4715ea002197fc467b6eb640188 [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Ezio Melottie3992eb2011-05-14 06:24:53 +03007from json import scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
13__all__ = ['JSONDecoder']
14
# Regex flags shared by the patterns compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
16
def _floatconstants():
    """Build the special float values used for the JSON constants.

    Each value is decoded from its big-endian 8-byte representation.

    Returns a ``(nan, inf, -inf)`` tuple of floats.
    """
    as_double = struct.Struct('>d').unpack
    nan = as_double(b'\x7f\xf8\x00\x00\x00\x00\x00\x00')[0]
    inf = as_double(b'\x7f\xf0\x00\x00\x00\x00\x00\x00')[0]
    return nan, inf, -inf

NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000023
24
def linecol(doc, pos):
    """Return the 1-based ``(line, column)`` of index ``pos`` in ``doc``.

    Lines are separated by ``'\\n'``; only newlines before ``pos`` count.
    """
    newlines_before = doc.count('\n', 0, pos)
    if not newlines_before:
        # Still on the first line: the column is just the offset.
        return 1, pos + 1
    # Column is the distance from the last newline preceding pos.
    last_newline = doc.rindex('\n', 0, pos)
    return newlines_before + 1, pos - last_newline
32
33
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by the location of ``pos`` in ``doc``.

    The location is given as line, column (both from ``linecol``) and the
    raw character index.  When ``end`` is given, the location of ``end`` is
    appended as well so the message describes a range.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        end_line, end_col = linecol(doc, end)
        template = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
        return template.format(
            msg, start_line, start_col, end_line, end_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000047
48
# Maps each non-standard JSON literal to the float value it decodes to.
_CONSTANTS = {'-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN}
54
# Matches a (possibly empty) run of ordinary characters as group 1, followed
# by the first character that needs attention as group 2: a double quote, a
# backslash, or a control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes: the character after the backslash mapped to the
# unicode character it stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding used by py_scanstring when the caller passes encoding=None.
DEFAULT_ENCODING = "utf-8"
62
def _decode_uXXXX(s, pos):
    """Decode the four hex digits of a ``\\uXXXX`` escape.

    ``pos`` is the index in ``s`` of the ``u`` following the backslash; the
    digits are the four characters after it.

    Returns the decoded code unit as a non-negative int.  Raises ValueError
    (message built by ``errmsg`` for position ``pos``) when fewer than four
    characters remain or any of them is not a hexadecimal digit.
    """
    esc = s[pos + 1:pos + 5]
    # Check every character explicitly: int(esc, 16) on its own is too
    # lenient -- it also accepts a sign, surrounding whitespace and a '0x'
    # prefix, none of which are valid inside a JSON escape.
    if len(esc) == 4 and all(c in '0123456789abcdefABCDEF' for c in esc):
        return int(esc, 16)
    msg = "Invalid \\uXXXX escape"
    raise ValueError(errmsg(msg, s, pos))
72
def py_scanstring(s, end, encoding=None, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan ``s`` for a JSON string and return its decoded value.

    ``end`` is the index of the character in ``s`` right after the quote
    that opened the JSON string.  All valid JSON escape sequences are
    unescaped; byte-string content is converted to unicode using
    ``encoding`` (``DEFAULT_ENCODING`` when it is None).  With ``strict``
    false, literal control characters are allowed inside the string.

    Returns a ``(text, index)`` tuple: the decoded unicode string and the
    index of the character in ``s`` after the closing quote.

    Raises ValueError for an unterminated string, an invalid escape, or
    (in strict mode) a literal control character.
    """
    if encoding is None:
        encoding = DEFAULT_ENCODING
    pieces = []
    add_piece = pieces.append
    opening_quote = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, opening_quote))
        end = match.end()
        text, stopper = match.groups()
        # text holds zero or more unescaped string characters
        if text:
            if not isinstance(text, unicode):
                text = unicode(text, encoding)
            add_piece(text)
        # stopper is the closing quote, a literal control character,
        # or a backslash announcing an escape sequence
        if stopper == '"':
            break
        if stopper != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(stopper)
                raise ValueError(errmsg(msg, s, end))
            add_piece(stopper)
            continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, opening_quote))
        if esc == 'u':
            # Unicode escape sequence
            code = _decode_uXXXX(s, end)
            end += 5
            # Check for surrogate pair on UCS-4 systems
            if (sys.maxunicode > 65535 and 0xd800 <= code <= 0xdbff
                    and s[end:end + 2] == '\\u'):
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            char = unichr(code)
        else:
            # Any other escape must be in the lookup table
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        # Append the unescaped character
        add_piece(char)
    return u''.join(pieces), end
140
141
# Prefer the C scanstring imported from _json; otherwise use the Python one.
scanstring = c_scanstring or py_scanstring

# The four JSON whitespace characters: as a plain string for single-character
# membership tests, and as a compiled pattern for skipping a whole run.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
Brett Cannon4b964f92008-05-05 20:21:38 +0000147
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index where
    the object's contents start (presumably just after the opening ``{`` --
    the brace itself is not checked here).  Keys are read with
    ``scanstring`` (passing ``encoding`` and ``strict``) and values with
    ``scan_once``.

    Returns ``(result, end)`` where ``end`` is the index just past the
    closing ``}``.  ``result`` is ``object_pairs_hook(pairs)`` when that
    hook is given, ``pairs`` being the list of ``(key, value)`` tuples in
    document order; otherwise it is a dict, passed through ``object_hook``
    when that is given.

    Raises ValueError for a missing or unquoted property name, a missing
    ``:`` or ``,`` delimiter, or a missing value.
    """
    s, end = s_and_end
    items = []
    add_item = items.append
    # Slice instead of indexing so end-of-input yields '' rather than an
    # IndexError; the checks below then raise a more specific ValueError.
    ch = s[end:end + 1]
    # Normally we expect ch == '"'
    if ch != '"':
        if ch in _ws:
            end = _w(s, end).end()
            ch = s[end:end + 1]
        # Trivial empty object
        if ch == '}':
            if object_pairs_hook is not None:
                return object_pairs_hook(items), end + 1
            empty = {}
            if object_hook is not None:
                empty = object_hook(empty)
            return empty, end + 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # Fast path: the separator directly follows the key.  Only fall back
        # to the whitespace regex when it does not.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace after ':'; a single space avoids the regex call.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        add_item((key, value))

        # Find the character that follows the value ('' at end of input).
        try:
            ch = s[end]
            if ch in _ws:
                end = _w(s, end + 1).end()
                ch = s[end]
        except IndexError:
            ch = ''
        end += 1

        if ch == '}':
            break
        if ch != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))

        # After a comma the next non-whitespace character must open a key.
        try:
            ch = s[end]
            if ch in _ws:
                end += 1
                ch = s[end]
                if ch in _ws:
                    end = _w(s, end + 1).end()
                    ch = s[end]
        except IndexError:
            ch = ''

        end += 1
        if ch != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end - 1))
    if object_pairs_hook is not None:
        return object_pairs_hook(items), end
    mapping = dict(items)
    if object_hook is not None:
        mapping = object_hook(mapping)
    return mapping, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000235
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index where
    the array's contents start (presumably just after the opening ``[`` --
    the bracket itself is not checked here).  Each element is decoded with
    ``scan_once(s, index)``.

    Returns ``(values, end)``: the list of decoded elements and the index
    just past the closing ``]``.

    Raises ValueError when an element is missing ("Expecting object") or
    when an element is followed by anything other than ``,`` or ``]``
    ("Expecting ',' delimiter", reported at the offending character).
    """
    s, end = s_and_end
    values = []
    # Slice instead of indexing so end-of-input yields '' rather than an
    # IndexError.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end has already been advanced past nextchar, so step back one
            # to report the offending character itself (as JSONObject does).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after ','; a single space avoids the regex call.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000271
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default, decoding performs these translations:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity``, and ``-Infinity`` are also understood and map to
    their corresponding ``float`` values, which is outside the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """``encoding`` determines the encoding used to interpret any ``str``
        objects decoded by this instance (utf-8 by default).  It has no
        effect when decoding ``unicode`` objects.

        Note that currently only encodings that are a superset of ASCII work;
        strings of other encodings should be passed in as ``unicode``.

        ``object_hook``, if specified, will be called with the result of
        every JSON object decoded, and its return value will be used in
        place of the given ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, will be called with the result
        of every JSON object decoded as an ordered list of pairs.  The
        return value of ``object_pairs_hook`` will be used instead of the
        ``dict``.  This feature can be used to implement custom decoders
        that rely on the order in which the key and value pairs are decoded
        (for example, collections.OrderedDict will remember the order of
        insertion).  If ``object_hook`` is also defined, the
        ``object_pairs_hook`` takes priority.

        ``parse_float``, if specified, will be called with the string of
        every JSON float to be decoded.  By default this is equivalent to
        float(num_str).  This can be used to use another datatype or parser
        for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, will be called with the string of every
        JSON int to be decoded.  By default this is equivalent to
        int(num_str).  This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, will be called with one of the
        following strings: -Infinity, Infinity, NaN.  This can be used to
        raise an exception if invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), then control
        characters will be allowed inside strings.  Control characters in
        this context are those with character codes in the 0-31 range,
        including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Settings supplied by the caller.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Converters fall back to the built-in behaviour when not supplied.
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        # Parsers for the composite and string types.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Build the scanner last: it is handed this decoder, presumably to
        # read the settings assigned above.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace before and after the document is ignored.  Raises
        ValueError ("Extra data") if anything else follows the document.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document), starting at index ``idx``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended.  This can be used to decode a JSON
        document from a string that may have extraneous data at the end.

        Raises ValueError if the scanner finds no JSON value.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end