blob: c7f04f9eec8e56d2c2b602b3d668f086d7aebdb9 [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Bob Ippolitod914e3f2009-03-17 23:19:00 +00007from json.scanner import make_scanner
try:
    # Optional C implementation of the string scanner; when it cannot be
    # imported the name is bound to None so the module-level ``scanstring``
    # selection can fall back to the pure-Python scanner.
    from _json import scanstring as c_scanstring
except ImportError:
    c_scanstring = None

__all__ = ['JSONDecoder']

# Flags shared by the regular expressions compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
16
def _floatconstants():
    """Return the IEEE 754 doubles ``(nan, inf, -inf)``.

    The values are unpacked from their big-endian bit patterns using an
    explicit ``'>'`` struct format, so the result does not depend on the
    host byte order and no manual byte swapping is required.
    """
    # 7FF8000000000000 is a quiet NaN, 7FF0000000000000 is +Infinity.
    nan, inf = struct.unpack(
        '>dd',
        b'\x7f\xf8\x00\x00\x00\x00\x00\x00'
        b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of character offset ``pos``.

    ``doc`` is the full document text and ``pos`` a 0-based index into it.
    Lines are separated by ``'\\n'``.  The column is counted from 1 on every
    line, including the first one, so positions are reported consistently
    regardless of which line they fall on.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() returns -1 when no newline precedes pos, which makes the
    # first line come out as pos + 1 without needing a special case.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
34
35
def errmsg(msg, doc, pos, end=None):
    """Format ``msg`` together with the location of ``pos`` in ``doc``.

    When ``end`` is given, the message describes the span from ``pos`` to
    ``end``; otherwise it describes the single position ``pos``.  Line and
    column numbers are obtained from ``linecol``.  Returns the formatted
    string.
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, start_line, start_col, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Literal names for the non-finite floats and the values they decode to.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}

# Lazily matches a run of ordinary characters (group 1) followed by the
# character that ends the run (group 2): a double quote, a backslash, or a
# control character in the range \x00-\x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Maps the character following a backslash to the character it stands for
# (the \u escape is handled separately by the string scanner).
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding used for non-unicode string content when none is supplied.
DEFAULT_ENCODING = "utf-8"
64
def _hex4(digits):
    """Return the value of exactly four hexadecimal digits, else None.

    Unlike a bare ``int(digits, 16)`` this rejects signs, whitespace, a
    ``0x`` prefix and any other character that is not ``0-9a-fA-F``.
    """
    if len(digits) == 4 and all(c in '0123456789abcdefABCDEF'
                                for c in digits):
        return int(digits, 16)
    return None


def py_scanstring(s, end, encoding=None, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode non-unicode content and defaults to
    DEFAULT_ENCODING.  ``_b`` and ``_m`` are pre-bound lookups and are not
    meant to be passed by callers.

    A ``\\uXXXX`` escape must consist of exactly four hexadecimal digits.
    On builds where ``sys.maxunicode > 65535`` a high surrogate
    (D800-DBFF) must be followed by a ``\\uXXXX`` low surrogate (DC00-DFFF);
    the two are combined into one character, and anything else raises
    ValueError.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting unterminated strings
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence; ``end`` is the index of the 'u'
            uni = _hex4(s[end + 1:end + 5])
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _hex4(s[end + 7:end + 11])
                # The second half must be a real low surrogate, otherwise
                # the arithmetic below would yield a bogus code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
141
142
# Use speedup if available
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of space, tab, newline and carriage return.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same four characters as a plain string, used for ``in`` tests.
WHITESPACE_STR = ' \t\n\r'
Brett Cannon4b964f92008-05-05 20:21:38 +0000148
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
               _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which the object's content starts (a property name, whitespace, or the
    closing brace is expected there).  It is a single positional argument,
    so callers keep passing one tuple.

    ``encoding`` and ``strict`` are forwarded to ``scanstring`` for the
    keys; ``scan_once(s, idx)`` parses each value and signals "no value
    here" by raising StopIteration.  If ``object_hook`` is not None it is
    called with the resulting dict -- including for an empty object -- and
    its return value is returned in place of the dict.

    Returns the result and the index just past the closing brace.  Raises
    ValueError when a property name, ``:``, value or ``,`` is missing.
    ``_w`` and ``_ws`` are pre-bound lookups, not meant for callers.
    """
    s, end = s_and_end
    pairs = {}
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            # The hook must see empty objects too, not only non-empty ones
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace after the colon; only fall back to the regex when
        # there is more than a single whitespace character.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs[key] = value

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            # end was already advanced, so end - 1 is the offending character
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000224
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which the array's content starts (a value, whitespace, or the closing
    bracket is expected there).  It is a single positional argument, so
    callers keep passing one tuple.

    ``scan_once(s, idx)`` parses each element and signals "no value here"
    by raising StopIteration.

    Returns the list of decoded values and the index just past the closing
    bracket.  Raises ValueError when a value or a ``,`` delimiter is
    missing.  ``_w`` and ``_ws`` are pre-bound lookups, not meant for
    callers.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end was already advanced past the character that was examined,
            # so report end - 1 to point at the offending character itself.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace after the comma; only fall back to the regex when
        # there is more than a single whitespace character.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000260
class JSONDecoder(object):
    """Decoder turning JSON <http://json.org> text into Python objects.

    Unless customised through the constructor, values are translated as
    follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    As an extension to the JSON spec, ``NaN``, ``Infinity`` and
    ``-Infinity`` are also accepted and decoded to the matching ``float``
    values.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
                 parse_int=None, parse_constant=None, strict=True):
        """Configure the decoder.

        ``encoding`` names the encoding used to interpret ``str`` input
        (utf-8 when left as None); it has no effect on ``unicode`` input.
        Only encodings that are a superset of ASCII currently work; text in
        any other encoding should be passed in as ``unicode``.

        ``object_hook``, when given, is called with the result of every
        JSON object decoded, and whatever it returns is used instead of the
        ``dict``.  This allows custom deserializations (e.g. JSON-RPC class
        hinting).

        ``parse_float``, when given, is called with the string of every
        JSON float; the default is equivalent to ``float(num_str)``.  Use
        it to obtain another type (e.g. decimal.Decimal).

        ``parse_int``, when given, is called with the string of every JSON
        int; the default is equivalent to ``int(num_str)``.  Use it to
        obtain another type (e.g. float).

        ``parse_constant``, when given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such non-standard numbers are encountered.

        ``strict`` is stored on the instance for the scanner; the string
        scanner in this module rejects literal control characters inside
        strings when it is true.

        """
        # Plain configuration values
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        # Converters fall back to the built-in behaviour when not supplied
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.parse_int = parse_int or int
        self.parse_float = parse_float or float
        # Parsers for the compound and string productions
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        # Built last: the scanner is constructed from this instance, so all
        # attributes above are set before it is created.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` or ``unicode`` holding exactly one JSON
        document) and return the corresponding Python object.

        Leading and trailing whitespace is ignored; anything else after the
        document raises ValueError.

        """
        start = _w(s, 0).end()
        obj, end = self.raw_decode(s, idx=start)
        end = _w(s, end).end()
        if end == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, end, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that starts at index ``idx`` of ``s``
        (a ``str`` or ``unicode``) and return a 2-tuple of the Python
        object and the index in ``s`` at which the document ended.

        Unlike ``decode``, trailing data after the document is allowed, so
        this can be used on strings with extraneous content at the end.
        Raises ValueError when no document can be decoded.

        """
        try:
            result, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return result, stop