blob: 73236ed4de97ac562c09e85bc29f089af3c94ca6 [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Bob Ippolitod914e3f2009-03-17 23:19:00 +00007from json.scanner import make_scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
13__all__ = ['JSONDecoder']
14
# Regex flags applied to every pattern compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
16
def _floatconstants():
    """Return the IEEE 754 doubles ``(nan, inf, -inf)``.

    The two positive values are unpacked from their big-endian bit
    patterns.  Naming the byte order in the struct format ('>') makes the
    result independent of the host's endianness, so no manual byte
    reversal is required.
    """
    nan, inf = struct.unpack(
        '>dd',
        b'\x7f\xf8\x00\x00\x00\x00\x00\x00'   # quiet NaN
        b'\x7f\xf0\x00\x00\x00\x00\x00\x00')  # +infinity
    return nan, inf, -inf


NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Return the 1-based ``(line, column)`` of character offset ``pos``.

    ``doc`` is the full document text and ``pos`` a 0-based index into it.
    Lines are separated by ``'\\n'``.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() returns -1 when no newline precedes pos, which makes the
    # column pos + 1 on the first line -- the same 1-based numbering that
    # the subtraction produces on every later line.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
34
35
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by the location of ``pos`` inside ``doc``.

    With only ``pos`` the message names a single line/column/character
    position.  When ``end`` is given as well, the message describes the
    range from ``pos`` to ``end``.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is not None:
        endlineno, endcolno = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(msg, lineno, colno, endlineno, endcolno,
                               pos, end)
    template = '{0}: line {1} column {2} (char {3})'
    return template.format(msg, lineno, colno, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Float value for each of the non-standard literals; JSONDecoder uses this
# table's lookup as its default ``parse_constant``.
_CONSTANTS = {
    'NaN': NaN,
    'Infinity': PosInf,
    '-Infinity': NegInf,
}
56
# One chunk of a JSON string body: a lazily matched run of ordinary
# characters (group 1) followed by the character that stops the run
# (group 2) -- a double quote, a backslash, or a control character.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes: the character following the backslash mapped
# to the character it stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding used for byte strings when the caller does not supply one.
DEFAULT_ENCODING = "utf-8"
64
# Matches exactly four hexadecimal digits at a given position.
_HEX4_MATCH = re.compile(r'[0-9A-Fa-f]{4}').match


def _hex4(s, pos):
    """Return the value of the four hex digits at ``s[pos:pos + 4]``.

    Returns None when fewer than four characters remain or when any of
    them is not a hexadecimal digit.  Checking the digits explicitly keeps
    forms that ``int(x, 16)`` tolerates (``0x1f``, ``+1ff``, `` 1ff``) from
    being accepted as escapes.
    """
    if _HEX4_MATCH(s, pos) is None:
        return None
    return int(s[pos:pos + 4], 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` names the codec used to turn non-unicode content into
    unicode; it defaults to DEFAULT_ENCODING.  ``_b`` and ``_m`` are
    pre-bound lookups and are not meant to be passed by callers.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote.

    Raises ValueError for an unterminated string, a literal control
    character in strict mode, an unknown backslash escape, a malformed
    ``\\uXXXX`` escape, or (on wide unicode builds) a high surrogate that
    is not followed by a valid low surrogate escape."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence: 'u' sits at s[end], digits follow
            uni = _hex4(s, end + 1)
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _hex4(s, end + 7)
                # The second escape must be a low surrogate; combining
                # anything else would produce an unrelated code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
141
142
# Prefer the C string scanner; fall back to the pure-Python one when the
# accelerator could not be imported (c_scanstring is None in that case).
scanstring = py_scanstring if not c_scanstring else c_scanstring

# The four JSON whitespace characters, as a character set for quick
# membership tests and as a pattern for skipping a whole run at once.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
Brett Cannon4b964f92008-05-05 20:21:38 +0000148
def JSONObject(s_and_end, encoding, strict, scan_once, object_hook,
        object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    past the opening ``{``.  ``encoding`` and ``strict`` are forwarded to
    ``scanstring`` for the keys; ``scan_once(s, idx)`` parses one value and
    signals "no value here" by raising StopIteration.

    The key/value pairs are collected in document order.  If
    ``object_pairs_hook`` is not None the result is its return value for
    that list of pairs; otherwise the pairs become a dict which is passed
    through ``object_hook`` when that is not None.  The same applies to an
    empty object.  The returned ``end`` is the index just past the closing
    ``}``.

    Raises ValueError when a property name, ``:`` or ``,`` delimiter, or a
    value is missing.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object: still goes through the hooks and yields a
        # dict (not the internal pairs list) when no hook is installed.
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end))
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting : delimiter", s, end))

        end += 1

        # Skip whitespace before the value; running off the end of the
        # input is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        pairs_append((key, value))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise ValueError(errmsg("Expecting property name", s, end - 1))

    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000230
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair where ``end`` is the index just
    past the opening ``[``.  ``scan_once(s, idx)`` parses one value and
    signals "no value here" by raising StopIteration.  ``values`` is a list
    of the parsed elements and the returned ``end`` is the index just past
    the closing ``]``.

    Raises ValueError when an element or a ``,`` delimiter is missing.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise ValueError(errmsg("Expecting object", s, end))
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end already points one past the character just examined, so
            # step back to report the position of the offending character.
            raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

        # Skip whitespace before the next element; running off the end of
        # the input is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000267
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are translated to Python as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and decoded
    to the corresponding ``float`` values, although the JSON spec does not
    define them.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret any ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect
        when decoding ``unicode`` objects.  Only encodings that are a
        superset of ASCII currently work; strings in other encodings
        should be passed in as ``unicode``.

        ``object_hook``, if specified, is called with the result of every
        JSON object decoded, and its return value is used in place of the
        ``dict``.  This can provide custom deserializations (e.g. to
        support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, is called with the list of
        ``(key, value)`` pairs of every JSON object decoded, and its
        return value is used in place of the ``dict``.  When both hooks
        are given, ``object_pairs_hook`` takes priority.

        ``parse_float``, if specified, is called with the string of every
        JSON float to be decoded.  The default is equivalent to
        float(num_str).  Another datatype or parser can be substituted
        (e.g. decimal.Decimal).

        ``parse_int``, if specified, is called with the string of every
        JSON int to be decoded.  The default is equivalent to
        int(num_str).  Another datatype or parser can be substituted
        (e.g. float).

        ``parse_constant``, if specified, is called with one of the
        strings -Infinity, Infinity, NaN.  It can be used to raise an
        exception when such invalid JSON numbers are encountered.

        ``strict`` is stored for the string scanner; when it is false,
        literal control characters are allowed inside strings.

        """
        # Plain settings, stored as given.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Converters: any false value selects the built-in default.
        self.parse_float = parse_float if parse_float else float
        self.parse_int = parse_int if parse_int else int
        self.parse_constant = (parse_constant if parse_constant
                               else _CONSTANTS.__getitem__)
        # Sub-parsers exposed as attributes alongside the settings above.
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        # Built last, once every attribute above has been assigned.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace around the document is ignored.  ValueError is raised
        if anything other than whitespace follows the document.

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode the JSON document that begins at index ``idx`` of ``s``
        (a ``str`` or ``unicode``) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        Unlike ``decode``, extraneous data after the document is allowed.
        ValueError is raised when no document can be decoded at ``idx``.

        """
        try:
            scanned = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        obj, end = scanned
        return obj, end