blob: 44635a00e74f61f9f49f24d1027522e58ae5285e [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001"""Implementation of JSONDecoder
2"""
Brett Cannon4b964f92008-05-05 20:21:38 +00003import re
4import sys
Bob Ippolitod914e3f2009-03-17 23:19:00 +00005import struct
Brett Cannon4b964f92008-05-05 20:21:38 +00006
Bob Ippolitod914e3f2009-03-17 23:19:00 +00007from json.scanner import make_scanner
Brett Cannon4b964f92008-05-05 20:21:38 +00008try:
9 from _json import scanstring as c_scanstring
10except ImportError:
11 c_scanstring = None
12
# Public names exported by ``from ... import *``.
__all__ = ['JSONDecoder']

# Flags passed to the regular expressions compiled in this module
# (STRINGCHUNK and WHITESPACE).  DOTALL lets ``.`` match newlines too.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
16
def _floatconstants():
    """Return the IEEE 754 doubles ``(nan, inf, -inf)``.

    The values are unpacked from fixed big-endian bit patterns.  The
    explicit ``'>'`` prefix in the struct format selects big-endian byte
    order with standard sizes, so the result does not depend on the host's
    native byte order and no manual byte swapping is required.
    """
    nan, inf = struct.unpack(
        '>dd',
        b'\x7f\xf8\x00\x00\x00\x00\x00\x00'   # quiet NaN
        b'\x7f\xf0\x00\x00\x00\x00\x00\x00')  # +Infinity
    return nan, inf, -inf


# Float values used for the non-standard JSON literals NaN, Infinity and
# -Infinity.
NaN, PosInf, NegInf = _floatconstants()
Brett Cannon4b964f92008-05-05 20:21:38 +000025
26
def linecol(doc, pos):
    """Translate the character offset ``pos`` within ``doc`` to a position.

    Returns a ``(lineno, colno)`` tuple.  Both numbers are 1-based: the
    first character of every line, including the first line, is column 1.
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind() yields -1 when no newline precedes pos, so the first line is
    # numbered from column 1 exactly like every following line.
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
34
35
def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` annotated with the location of an error in ``doc``.

    ``pos`` is the character offset of the error.  When ``end`` is given
    the message describes the span from ``pos`` to ``end``; otherwise it
    describes the single position ``pos``.  Offsets are converted to
    line/column pairs with linecol().
    """
    # Note that this function is called from _json
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = ('{0}: line {1} column {2} - line {3} column {4} '
                    '(char {5} - {6})')
        return template.format(msg, start_line, start_col,
                               stop_line, stop_col, pos, end)
    return '{0}: line {1} column {2} (char {3})'.format(
        msg, start_line, start_col, pos)
Brett Cannon4b964f92008-05-05 20:21:38 +000049
50
# Maps the non-standard JSON literals to the float values they decode to.
# JSONDecoder uses this table's __getitem__ as its default parse_constant.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}
56
# Matches one chunk of a JSON string body.  Group 1 is a (possibly empty)
# run of ordinary characters; group 2 is the character that ended the run:
# a double quote, a backslash, or a control character (\x00-\x1f).
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: the character following a backslash mapped to
# the unicode character it stands for.  \uXXXX is handled separately in
# py_scanstring.
BACKSLASH = {
    '"': u'"', '\\': u'\\', '/': u'/',
    'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
}

# Encoding py_scanstring uses for non-unicode content when the caller
# passes encoding=None.
DEFAULT_ENCODING = "utf-8"
64
def _parse_hex4(digits):
    """Return the integer value of exactly four hexadecimal digits.

    Returns None when ``digits`` is not four characters drawn from
    ``0-9a-fA-F``.  The explicit digit check is needed because int() would
    also accept a sign, surrounding whitespace or a ``0x`` prefix.
    """
    if len(digits) == 4 and all(c in '0123456789abcdefABCDEF'
                                for c in digits):
        return int(digits, 16)
    return None


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    ``encoding`` is used to decode non-unicode content and defaults to
    DEFAULT_ENCODING.  ``_b`` and ``_m`` are default-argument bindings of
    the escape table and the chunk matcher and are not meant to be passed
    by callers.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote.

    Raises ValueError (with a message built by errmsg()) for an
    unterminated string, a literal control character in strict mode, an
    unknown backslash escape, a malformed ``\\uXXXX`` escape, or -- on
    builds where sys.maxunicode > 65535 -- a high surrogate that is not
    followed by a well-formed low surrogate escape."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used for "unterminated string" errors.
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise ValueError(errmsg(msg, s, end))
            end += 1
        else:
            # Unicode escape sequence; ``end`` is the index of the 'u'.
            uni = _parse_hex4(s[end + 1:end + 5])
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise ValueError(errmsg(msg, s, end))
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if s[end + 5:end + 7] != '\\u':
                    raise ValueError(errmsg(msg, s, end))
                uni2 = _parse_hex4(s[end + 7:end + 11])
                # The second escape must be well-formed AND a low
                # surrogate; otherwise the combination below would yield
                # a wrong or out-of-range code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise ValueError(errmsg(msg, s, end))
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
141
142
# Use speedup if available: c_scanstring is None when the _json extension
# could not be imported, in which case the pure-Python scanner is used.
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of JSON whitespace; used to skip ahead.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same whitespace characters as a plain string, for cheap ``in`` tests
# before falling back to the regular expression.
WHITESPACE_STR = ' \t\n\r'
Brett Cannon4b964f92008-05-05 20:21:38 +0000148
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000149def JSONObject((s, end), encoding, strict, scan_once, object_hook,
Raymond Hettinger91852ca2009-03-19 19:19:03 +0000150 object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
151 pairs = []
152 pairs_append = pairs.append
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000153 # Use a slice to prevent IndexError from being raised, the following
154 # check will raise a more specific ValueError if the string is empty
Brett Cannon4b964f92008-05-05 20:21:38 +0000155 nextchar = s[end:end + 1]
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000156 # Normally we expect nextchar == '"'
Brett Cannon4b964f92008-05-05 20:21:38 +0000157 if nextchar != '"':
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000158 if nextchar in _ws:
159 end = _w(s, end).end()
160 nextchar = s[end:end + 1]
161 # Trivial empty object
162 if nextchar == '}':
163 return pairs, end + 1
164 elif nextchar != '"':
165 raise ValueError(errmsg("Expecting property name", s, end))
Brett Cannon4b964f92008-05-05 20:21:38 +0000166 end += 1
Brett Cannon4b964f92008-05-05 20:21:38 +0000167 while True:
168 key, end = scanstring(s, end, encoding, strict)
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000169
170 # To skip some function call overhead we optimize the fast paths where
171 # the JSON key separator is ": " or just ":".
Brett Cannon4b964f92008-05-05 20:21:38 +0000172 if s[end:end + 1] != ':':
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000173 end = _w(s, end).end()
174 if s[end:end + 1] != ':':
175 raise ValueError(errmsg("Expecting : delimiter", s, end))
176
177 end += 1
178
Brett Cannon4b964f92008-05-05 20:21:38 +0000179 try:
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000180 if s[end] in _ws:
181 end += 1
182 if s[end] in _ws:
183 end = _w(s, end + 1).end()
184 except IndexError:
185 pass
186
187 try:
188 value, end = scan_once(s, end)
Brett Cannon4b964f92008-05-05 20:21:38 +0000189 except StopIteration:
190 raise ValueError(errmsg("Expecting object", s, end))
Raymond Hettinger91852ca2009-03-19 19:19:03 +0000191 pairs_append((key, value))
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000192
193 try:
194 nextchar = s[end]
195 if nextchar in _ws:
196 end = _w(s, end + 1).end()
197 nextchar = s[end]
198 except IndexError:
199 nextchar = ''
Brett Cannon4b964f92008-05-05 20:21:38 +0000200 end += 1
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000201
Brett Cannon4b964f92008-05-05 20:21:38 +0000202 if nextchar == '}':
203 break
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000204 elif nextchar != ',':
Brett Cannon4b964f92008-05-05 20:21:38 +0000205 raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000206
207 try:
208 nextchar = s[end]
209 if nextchar in _ws:
210 end += 1
211 nextchar = s[end]
212 if nextchar in _ws:
213 end = _w(s, end + 1).end()
214 nextchar = s[end]
215 except IndexError:
216 nextchar = ''
217
Brett Cannon4b964f92008-05-05 20:21:38 +0000218 end += 1
219 if nextchar != '"':
220 raise ValueError(errmsg("Expecting property name", s, end - 1))
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000221
Raymond Hettinger91852ca2009-03-19 19:19:03 +0000222 if object_pairs_hook is not None:
223 result = object_pairs_hook(pairs)
224 return result, end
225 pairs = dict(pairs)
Brett Cannon4b964f92008-05-05 20:21:38 +0000226 if object_hook is not None:
227 pairs = object_hook(pairs)
228 return pairs, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000229
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000230def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
Brett Cannon4b964f92008-05-05 20:21:38 +0000231 values = []
Brett Cannon4b964f92008-05-05 20:21:38 +0000232 nextchar = s[end:end + 1]
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000233 if nextchar in _ws:
234 end = _w(s, end + 1).end()
235 nextchar = s[end:end + 1]
236 # Look-ahead for trivial empty array
Brett Cannon4b964f92008-05-05 20:21:38 +0000237 if nextchar == ']':
238 return values, end + 1
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000239 _append = values.append
Brett Cannon4b964f92008-05-05 20:21:38 +0000240 while True:
241 try:
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000242 value, end = scan_once(s, end)
Brett Cannon4b964f92008-05-05 20:21:38 +0000243 except StopIteration:
244 raise ValueError(errmsg("Expecting object", s, end))
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000245 _append(value)
Brett Cannon4b964f92008-05-05 20:21:38 +0000246 nextchar = s[end:end + 1]
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000247 if nextchar in _ws:
248 end = _w(s, end + 1).end()
249 nextchar = s[end:end + 1]
Brett Cannon4b964f92008-05-05 20:21:38 +0000250 end += 1
251 if nextchar == ']':
252 break
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000253 elif nextchar != ',':
Brett Cannon4b964f92008-05-05 20:21:38 +0000254 raise ValueError(errmsg("Expecting , delimiter", s, end))
Bob Ippolitod914e3f2009-03-17 23:19:00 +0000255
256 try:
257 if s[end] in _ws:
258 end += 1
259 if s[end] in _ws:
260 end = _w(s, end + 1).end()
261 except IndexError:
262 pass
263
Brett Cannon4b964f92008-05-05 20:21:38 +0000264 return values, end
Brett Cannon4b964f92008-05-05 20:21:38 +0000265
class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    By default JSON values are translated to Python values as follows:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    ``NaN``, ``Infinity`` and ``-Infinity`` are also accepted and decoded
    to the corresponding ``float`` values, although they are not part of
    the JSON spec.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """Configure the decoder.

        ``encoding`` is the encoding used to interpret ``str`` objects
        decoded by this instance (utf-8 by default).  It has no effect on
        ``unicode`` input.  Only encodings that are a superset of ASCII
        currently work; text in other encodings should be passed in as
        ``unicode``.

        ``object_hook``, if given, is called with the ``dict`` built for
        every decoded JSON object and its return value is used instead
        (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if given, is called with the list of
        ``(key, value)`` pairs of every decoded JSON object and its return
        value is used instead of the ``dict``.  In JSONObject it takes
        priority over ``object_hook``.

        ``parse_float``, if given, is called with the string of every JSON
        float to be decoded; the default is ``float``.  Useful for another
        datatype or parser (e.g. decimal.Decimal).

        ``parse_int``, if given, is called with the string of every JSON
        int to be decoded; the default is ``int``.  Useful for another
        datatype or parser (e.g. float).

        ``parse_constant``, if given, is called with one of the strings
        -Infinity, Infinity, NaN.  It can be used to raise an exception
        when such invalid JSON numbers are encountered.

        ``strict`` is stored for the string scanner; when it is false,
        py_scanstring allows literal control characters inside strings.

        """
        # Plain configuration values.
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Conversion callbacks; a falsy argument selects the default.
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.parse_int = parse_int or int
        self.parse_float = parse_float or float
        # Parsers for the composite and string values.
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        # make_scanner() receives the decoder itself, so it is called only
        # after every attribute above has been set.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` or
        ``unicode`` instance containing a JSON document).

        Whitespace before and after the document is ignored; anything else
        after the document raises ValueError ("Extra data").

        """
        start = _w(s, 0).end()
        obj, stop = self.raw_decode(s, idx=start)
        stop = _w(s, stop).end()
        if stop == len(s):
            return obj
        raise ValueError(errmsg("Extra data", s, stop, len(s)))

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
        beginning with a JSON document), starting at index ``idx``.

        Returns a 2-tuple of the Python representation and the index in
        ``s`` where the document ended, which makes it possible to decode
        a document from a string that has extraneous data at the end.

        Raises ValueError when no JSON value can be decoded at ``idx``.

        """
        try:
            result, stop = self.scan_once(s, idx)
        except StopIteration:
            raise ValueError("No JSON object could be decoded")
        else:
            return result, stop