blob: 1b4323839b96b07ee60291dec1d9d6b8e1d4437c [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Ezio Melottie3992eb2011-05-14 06:24:53 +03007from json import scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
# Names exported by ``from <this module> import *``.
__all__ = ['JSONDecoder']

# Flag set passed to every regular expression compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
16
Bob Ippolitod914e3f2009-03-17 23:19:00 +000017def _floatconstants():
18 _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
19 if sys.byteorder != 'big':
20 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
21 nan, inf = struct.unpack('dd', _BYTES)
22 return nan, inf, -inf
23
24NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Convert the character offset ``pos`` in ``doc`` to ``(line, column)``.

    Both numbers are 1-based.  The column is the distance from the last
    newline that occurs before ``pos`` (or from the start of ``doc`` when
    there is none).
    """
    # rfind() gives -1 when no newline precedes ``pos``; subtracting it
    # yields ``pos + 1``, which is exactly the first-line column.
    previous_newline = doc.rfind('\n', 0, pos)
    line = doc.count('\n', 0, pos) + 1
    column = pos - previous_newline
    return line, column
34
35
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location it refers to in ``doc``.

    ``pos`` is a character offset into ``doc``; it is reported as a line
    and column (via ``linecol``) plus the raw offset.  When ``end`` is
    given, the message describes the span from ``pos`` to ``end`` instead
    of a single position.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Non-standard literals mapped to their float values; JSONDecoder uses this
# table as the default ``parse_constant`` lookup.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Group 1: a (possibly empty) run of ordinary string characters.
# Group 2: the character that ends the run -- a double quote, a backslash,
# or a control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: the character following a backslash mapped to
# the character it stands for.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring falls back to when it is called with encoding=None.
DEFAULT_ENCODING = "utf-8"
64
Serhiy Storchakadafda9b2013-11-26 21:25:15 +020065def _decode_uXXXX(s, pos):
66 esc = s[pos + 1:pos + 5]
67 if len(esc) == 4 and esc[1] not in 'xX':
68 try:
69 return int(esc, 16)
70 except ValueError:
71 pass
72 msg = "Invalid \\uXXXX escape"
73 raise ValueError(errmsg(msg, s, pos))
74
def py_scanstring(s, end, encoding=None, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string literal in ``s`` whose body begins at ``end``.

    ``end`` is the index of the character right after the opening quote.
    Every valid JSON escape sequence is replaced by the character it
    denotes; an invalid or unterminated string raises ValueError.  Literal
    control characters are rejected unless ``strict`` is false.  Chunks
    that are not already ``unicode`` are decoded using ``encoding``
    (DEFAULT_ENCODING when None).

    Returns a tuple of the decoded string and the index in ``s`` of the
    character after the closing quote.
    """
    if encoding is None:
        encoding = DEFAULT_ENCODING
    pieces = []
    add_piece = pieces.append
    # Index of the opening quote, reported when the string never terminates.
    opening_quote = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, opening_quote))
        end = match.end()
        literal, terminator = match.groups()
        # ``literal`` contains zero or more unescaped string characters.
        if literal:
            if not isinstance(literal, unicode):
                literal = unicode(literal, encoding)
            add_piece(literal)
        # ``terminator`` is the closing quote, a literal control character,
        # or a backslash announcing an escape sequence.
        if terminator == '"':
            return u''.join(pieces), end
        if terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            add_piece(terminator)
            continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, opening_quote))
        if esc == 'u':
            # \uXXXX escape: ``end`` moves past the 'u' and four hex digits.
            code = _decode_uXXXX(s, end)
            end += 5
            # On UCS-4 builds, merge a high surrogate with a directly
            # following \uXXXX low surrogate into a single code point.
            if (sys.maxunicode > 65535 and 0xd800 <= code <= 0xdbff
                    and s[end:end + 2] == '\\u'):
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            decoded = unichr(code)
        else:
            # Every other escape must be present in the lookup table.
            try:
                decoded = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        # Append the unescaped character.
        add_piece(decoded)
142
143
# Use the C implementation when the _json import succeeded (c_scanstring is
# None otherwise); fall back to the pure-Python scanner.
scanstring = c_scanstring or py_scanstring

# JSON whitespace in two forms: a regex whose match() skips a whole run, and
# a plain string used for single-character membership tests.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
Brett Cannon4b964f92008-05-05 20:21:38 +0000149
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which scanning starts.  NOTE(review): the code expects a ``"`` or ``}``
    (after optional whitespace) at that index, so ``end`` is presumably the
    position just after the opening ``{`` -- confirm against the scanner.

    ``encoding`` and ``strict`` are forwarded to ``scanstring`` for the
    keys.  ``scan_once(s, idx)`` must return ``(value, next_index)`` and
    raise StopIteration when no value starts at ``idx``.

    The collected ``(key, value)`` pairs are passed to ``object_pairs_hook``
    when it is not None and its return value is used as the result.
    Otherwise a dict is built from the pairs and, when ``object_hook`` is
    not None, the dict is passed through it.

    The returned ``end`` is the index just past the closing ``}``.
    Raises ValueError for a missing property name, ``:`` or ``,``
    delimiter, or value.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    # Step over the opening quote of the first key.
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; running off the end of ``s`` is
        # left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        # Fetch the character after the value (skipping whitespace); an
        # empty string stands for end of input.
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        # ``end`` now points one past ``nextchar``.
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end - 1 is the position of the offending character.
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))

        # After the comma, skip whitespace up to the next key's opening quote.
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        # Step over the opening quote (checked just below).
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end - 1))
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000237
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which scanning starts (the code expects optional whitespace followed by
    either ``]`` or the first element there).  ``scan_once(s, idx)`` must
    return ``(value, next_index)`` and raise StopIteration when no value
    starts at ``idx``.

    ``values`` is a list of the decoded elements and the returned ``end``
    is the index just past the closing ``]``.  Raises ValueError when an
    element or a ``,`` delimiter is missing; the reported position is that
    of the offending character.
    """
    s, end = s_and_end
    values = []
    # Slicing yields '' at end of input; '' counts as "in _ws", and the
    # checks further down then report the truncated document.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        # ``end`` now points one past ``nextchar``.
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # ``end`` has already moved past the offending character, so
            # step back one to report its real position (as JSONObject does).
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after the comma; running off the end of ``s`` is
        # left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000273
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are translated to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    Although they are not part of the JSON specification, ``NaN``,
    ``Infinity`` and ``-Infinity`` are also accepted and decoded to the
    corresponding ``float`` values.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True,
                 object_pairs_hook=None):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect on
        ``unicode`` input.  Only encodings that are a superset of ASCII
        currently work; text in any other encoding should be passed in as
        ``unicode``.

        ``object_hook``, if given, is called with the ``dict`` produced for
        every decoded JSON object, and its return value is used in place of
        that ``dict``.  This allows custom deserializations (e.g. JSON-RPC
        class hinting).

        ``object_pairs_hook``, if given, is called with an ordered list of
        ``(key, value)`` pairs for every decoded JSON object, and its return
        value is used instead of the ``dict``.  This supports decoders that
        depend on the order in which pairs appear (for example,
        collections.OrderedDict remembers insertion order).  When both
        hooks are given, ``object_pairs_hook`` takes priority.

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded.  The default is equivalent to float(num_str).
        It can be used to plug in another datatype or parser for JSON
        floats (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded.  The default is equivalent to int(num_str).  It
        can be used to plug in another datatype or parser for JSON integers
        (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        If ``strict`` is false (it is true by default), control characters
        are allowed inside strings.  Control characters here are those with
        character codes in the 0-31 range, including ``'\\t'`` (tab),
        ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        self.encoding = encoding
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # A missing (or otherwise falsy) converter selects the built-in one.
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (
            parse_constant if parse_constant else _CONSTANTS.__getitem__)
        self.strict = strict
        # Parsing callbacks exposed as attributes of the decoder instance.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last: the scanner is constructed from this instance.
        self.scan_once = scanner.make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace before and after the document is ignored; anything else
        after it raises ValueError ("Extra data").

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document starting at index ``idx`` of ``s`` (a
        ``str`` or ``unicode``) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Unlike ``decode``, this tolerates extraneous data after the
        document.  Raises ValueError when no JSON value can be decoded.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, end