blob: b9745f788442b378e9377a9ad5ab67e6914293bd [file] [log] [blame]
"""Implementation of JSONDecoder
"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Bob Ippolitod914e3f2009-03-17 23:19:00 +00007from json.scanner import make_scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
13__all__ = ['JSONDecoder']
14
# Flags shared by every regular expression compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
16
def _floatconstants():
    """Return the special float values as the tuple ``(nan, inf, -inf)``.

    The values are obtained by letting ``float()`` parse the spellings
    ``'nan'`` and ``'inf'`` instead of unpacking a hand-written IEEE 754
    byte string, so no byte-order handling is required.
    """
    nan = float('nan')
    inf = float('inf')
    return nan, inf, -inf

# Module-level aliases for the three non-finite float values.
NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Translate the character offset ``pos`` in ``doc`` into a position.

    Returns a ``(lineno, colno)`` tuple.  Both numbers are 1-based: the
    first character of every line, including the first line, is reported
    as column 1.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, which makes the
    # subtraction below come out as pos + 1 on the first line, i.e. the
    # same 1-based column numbering that later lines get.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
34
35
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` extended with the location of an error in ``doc``.

    ``pos`` is the character offset the error refers to.  When ``end`` is
    given as well, the message describes the range from ``pos`` to ``end``
    instead of a single position.  Line and column numbers are computed
    with ``linecol``.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Maps the non-standard JSON number literals to their float values.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}
56
# Matches a (possibly empty) run of ordinary characters followed by the
# character that ends the run: a double quote, a backslash or a control
# character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes and the character each one stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding assumed for byte strings when the caller does not name one.
DEFAULT_ENCODING = "utf-8"
64
def _parse_hex4(esc):
    """Return the value of ``esc`` if it is exactly four hex digits, else None."""
    if len(esc) != 4:
        return None
    # int(esc, 16) alone is too permissive: it also accepts forms such as
    # '0x1f', ' 1f ' or '+1ff', none of which are valid in a JSON escape.
    for c in esc:
        if c not in '0123456789abcdefABCDEF':
            return None
    return int(esc, 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to convert non-unicode content to unicode and
    defaults to DEFAULT_ENCODING.  ``_b`` and ``_m`` are pre-bound lookups
    of the escape table and the chunk matcher.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    On builds where ``sys.maxunicode > 65535`` a high surrogate must be
    followed by a ``\\uXXXX`` escape holding a low surrogate; the two are
    combined into one character, anything else raises ValueError.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated string" errors
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence
            uni = _parse_hex4(s[end + 1:end + 5])
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s[end + 7:end + 11])
                # The second half has to be a well-formed low surrogate,
                # otherwise the arithmetic below yields a bogus code point
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
141
142
# Prefer the C implementation of the string scanner when it was importable
scanstring = c_scanstring if c_scanstring else py_scanstring

# The characters treated as insignificant whitespace between JSON tokens,
# as a compiled pattern and as a plain string for membership tests.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
WHITESPACE_STR = ' \t\n\r'
Brett Cannon4b964f92008-05-05 20:21:38 +0000148
def _make_object(pairs, object_hook, object_pairs_hook):
    """Build the final value for a decoded object from its (key, value) pairs."""
    # object_pairs_hook takes priority and receives the ordered pair list
    if object_pairs_hook is not None:
        return object_pairs_hook(pairs)
    obj = dict(pairs)
    if object_hook is not None:
        obj = object_hook(obj)
    return obj


def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
        object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode a JSON object.

    ``s_and_end`` is a ``(s, end)`` tuple where ``end`` is the index in
    ``s`` just after the opening ``{``.  Keys are read with ``scanstring``
    (using ``encoding`` and ``strict``) and values with ``scan_once``.

    Returns a ``(value, end)`` tuple; ``end`` is the index just after the
    closing ``}``.  ``value`` is the result of ``object_pairs_hook`` called
    with the list of pairs if that hook is given, otherwise a dict, passed
    through ``object_hook`` if that hook is given.  An empty object is
    handled the same way as a non-empty one.

    Raises ValueError if the text is not a well-formed JSON object.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object: still goes through the hooks / dict
        # conversion so callers never see the internal pair list
        if nextchar == '}':
            return _make_object(pairs, object_hook, object_pairs_hook), end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace before the value; running off the end of s is
        # left for scan_once to report
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    return _make_object(pairs, object_hook, object_pairs_hook), end
Brett Cannon4b964f92008-05-05 20:21:38 +0000230
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Decode a JSON array.

    ``s_and_end`` is a ``(s, end)`` tuple where ``end`` is the index in
    ``s`` just after the opening ``[``.  Each element is decoded with
    ``scan_once``.

    Returns a ``(values, end)`` tuple: the list of decoded elements and the
    index just after the closing ``]``.

    Raises ValueError if an element cannot be decoded or if an element is
    not followed by ``,`` or ``]``; the reported position is that of the
    offending character.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the offending character, so
            # step back one to report where the delimiter was expected
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace before the next element; running off the end of
        # s is left for scan_once to report
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000267
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Unless customised through the constructor arguments, JSON values are
    converted to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are accepted as well and are
    turned into the matching ``float`` values, although they are not part
    of the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Create a decoder.

        ``encoding`` names the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect
        when decoding ``unicode`` objects.

        Note that currently only encodings that are a superset of ASCII
        work; strings in other encodings should be passed in as
        ``unicode``.

        ``object_hook``, if specified, is called with the ``dict`` built
        for every decoded JSON object, and its return value is used in
        place of that ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, is called with the ordered
        list of ``(key, value)`` pairs of every decoded JSON object, and
        its return value is used instead of the ``dict``.  This can be used
        to implement decoders that rely on the order in which the pairs
        were decoded (for example, collections.OrderedDict remembers the
        order of insertion).  If ``object_hook`` is also given,
        ``object_pairs_hook`` takes priority.

        ``parse_float``, if specified, is called with the string of every
        JSON float to be decoded.  By default this is equivalent to
        float(num_str).  This can be used to use another datatype or
        parser for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, is called with the string of every
        JSON int to be decoded.  By default this is equivalent to
        int(num_str).  This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, is called with one of the
        following strings: -Infinity, Infinity, NaN.  This can be used to
        raise an exception if invalid JSON numbers are encountered.

        If ``strict`` is false (true is the default), control characters
        are allowed inside strings.  Control characters in this context
        are those with character codes in the 0-31 range, including
        ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        # Plain settings, stored as given
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook

        # Converters, falling back to the defaults when none is supplied
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__

        # Parsers for the composite values and for strings
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring

        # Built last: the scanner is constructed from this instance, after
        # every attribute above has been assigned
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace around the document is ignored; anything else after the
        document raises ValueError ("Extra data").

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        # Only whitespace may follow the document
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document), starting at index ``idx``, and
        return a 2-tuple of the Python representation and the index in
        ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises ValueError if no JSON document is found.

        """
        try:
            obj, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return obj, stop