"""Implementation of JSONDecoder
"""
import re

from json import scanner
# The C accelerator is optional; fall back to the pure-Python scanner
# defined in this module when it cannot be imported.
try:
    from _json import scanstring as c_scanstring
except ImportError:
    c_scanstring = None

__all__ = ['JSONDecoder']

# Flags shared by every regular expression compiled in this module.
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

# Float values used for the non-standard constants NaN, Infinity, -Infinity.
NaN = float('nan')
PosInf = float('inf')
NegInf = float('-inf')


def linecol(doc, pos):
    """Return the 1-based ``(lineno, colno)`` of index ``pos`` in ``doc``.

    ``doc`` may be a ``str`` or a ``bytes`` object; the newline separator
    that is searched for is chosen to match its type.
    """
    newline = b'\n' if isinstance(doc, bytes) else '\n'
    lineno = doc.count(newline, 0, pos) + 1
    if lineno == 1:
        # No newline precedes pos, so the column is the offset itself.
        colno = pos + 1
    else:
        # Distance from the last newline found before pos.
        colno = pos - doc.rindex(newline, 0, pos)
    return lineno, colno


def errmsg(msg, doc, pos, end=None):
    """Return ``msg`` followed by the location of ``pos`` within ``doc``.

    The location is given as a 1-based line and column (computed by
    ``linecol``) plus the raw character index.  When ``end`` is supplied,
    the location of ``end`` is appended as well so the message describes
    a range.
    """
    # Note that this function is called from _json
    lineno, colno = linecol(doc, pos)
    if end is None:
        fmt = '{0}: line {1} column {2} (char {3})'
        return fmt.format(msg, lineno, colno, pos)
    endlineno, endcolno = linecol(doc, end)
    fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
    return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)


# Literal names outside the JSON spec, mapped to their float values.
_CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
}


# Matches a lazy run of ordinary characters (group 1) followed by the next
# character that needs special handling (group 2): a double quote, a
# backslash, or a control character in the range 0x00-0x1f.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
# Single-character escapes: character after the backslash -> decoded value.
BACKSLASH = {
    '"': '"', '\\': '\\', '/': '/',
    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
}

def _decode_uXXXX(s, pos, _hexdigits=frozenset('0123456789abcdefABCDEF')):
    """Return the integer encoded by the four hex digits following ``s[pos]``.

    ``pos`` is the index of the ``u`` of a ``\\uXXXX`` escape, so the digits
    are read from ``s[pos + 1:pos + 5]``.

    Raises ValueError if fewer than four characters remain or if any of
    them is not an ASCII hexadecimal digit.
    """
    esc = s[pos + 1:pos + 5]
    # int() on its own is too lenient here: it also accepts a leading sign,
    # surrounding whitespace, a '0x' prefix, digit-group underscores and
    # non-ASCII digits, none of which belong in a \uXXXX escape.  Check each
    # character explicitly before converting.
    if len(esc) == 4 and _hexdigits.issuperset(esc):
        return int(esc, 16)
    msg = "Invalid \\uXXXX escape"
    raise ValueError(errmsg(msg, s, pos))

def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises ValueError
    on attempt to decode an invalid string. If strict is False then literal
    control characters are allowed in the string.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used in "unterminated string" messages.
    begin = end - 1
    while True:
        chunk = _m(s, end)
        if chunk is None:
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin))
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                # end is the index just past the offending control character.
                msg = "Invalid control character {0!r} at".format(terminator)
                raise ValueError(errmsg(msg, s, end))
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            # The input ended right after the backslash.
            raise ValueError(
                errmsg("Unterminated string starting at", s, begin)) from None
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise ValueError(errmsg(msg, s, end)) from None
            end += 1
        else:
            uni = _decode_uXXXX(s, end)
            # Skip the 'u' and its four hex digits.
            end += 5
            # A high surrogate immediately followed by another \u escape may
            # form a surrogate pair; combine them only when the second value
            # really is a low surrogate, otherwise leave it for the next
            # iteration.
            if 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
                uni2 = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= uni2 <= 0xdfff:
                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                    end += 6
            char = chr(uni)
        _append(char)
    return ''.join(chunks), end


# Use speedup if available
scanstring = c_scanstring or py_scanstring

# Matches a (possibly empty) run of JSON whitespace characters.
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
# The same whitespace characters, for cheap single-character membership tests.
WHITESPACE_STR = ' \t\n\r'


def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which to start looking for the first property name or the closing
    ``}`` (presumably just past the opening brace; confirm against the
    scanner that calls this).

    ``strict`` is forwarded to ``scanstring`` when keys are read.
    ``scan_once(s, idx)`` must return ``(value, end)`` and signal "no value
    here" by raising StopIteration whose ``value`` is the failing index.
    ``memo`` is a dict used to reuse one string object for equal keys.

    If ``object_pairs_hook`` is not None the result is
    ``object_pairs_hook(pairs)`` with ``pairs`` a list of ``(key, value)``
    tuples in document order.  Otherwise a dict is built and, if
    ``object_hook`` is not None, passed through it.  The returned ``end``
    is the index just past the closing ``}``.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end))
    # Step over the opening quote of the first key.
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise ValueError(errmsg("Expecting ':' delimiter", s, end))
        end += 1

        # Skip whitespace after the colon; running off the end of the
        # document is reported by scan_once below.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            raise ValueError(errmsg("Expecting value", s, err.value)) from None
        pairs_append((key, value))
        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            # End of input: the delimiter check below reports the error.
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar != '"':
            raise ValueError(errmsg(
                "Expecting property name enclosed in double quotes", s, end - 1))
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end

def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    ``s_and_end`` is a ``(s, end)`` pair: the document and the index at
    which to start looking for the first element or the closing ``]``
    (presumably just past the opening bracket; confirm against the
    scanner that calls this).

    ``scan_once(s, idx)`` must return ``(value, end)`` and signal "no value
    here" by raising StopIteration whose ``value`` is the failing index.

    ``values`` is a list of the decoded elements and the returned ``end``
    is the index just past the closing ``]``.

    Raises ValueError on malformed input.
    """
    s, end = s_and_end
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            raise ValueError(errmsg("Expecting value", s, err.value)) from None
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            raise ValueError(errmsg("Expecting ',' delimiter", s, end - 1))
        # Skip whitespace after the comma; running off the end of the
        # document is reported by scan_once on the next iteration.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end


class JSONDecoder(object):
    """Simple JSON <http://json.org> decoder

    Performs the following translations in decoding by default:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | str               |
    +---------------+-------------------+
    | number (int)  | int               |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
    their corresponding ``float`` values, which is outside the JSON spec.

    """

    def __init__(self, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """``object_hook``, if specified, will be called with the result
        of every JSON object decoded and its return value will be used in
        place of the given ``dict``.  This can be used to provide custom
        deserializations (e.g. to support JSON-RPC class hinting).

        ``object_pairs_hook``, if specified, will be called with the result
        of every JSON object decoded with an ordered list of pairs.  The
        return value of ``object_pairs_hook`` will be used instead of the
        ``dict``.  This feature can be used to implement custom decoders
        that rely on the order that the key and value pairs are decoded
        (for example, collections.OrderedDict will remember the order of
        insertion).  If ``object_hook`` is also defined, the
        ``object_pairs_hook`` takes priority.

        ``parse_float``, if specified, will be called with the string
        of every JSON float to be decoded.  By default this is equivalent to
        float(num_str).  This can be used to use another datatype or parser
        for JSON floats (e.g. decimal.Decimal).

        ``parse_int``, if specified, will be called with the string
        of every JSON int to be decoded.  By default this is equivalent to
        int(num_str).  This can be used to use another datatype or parser
        for JSON integers (e.g. float).

        ``parse_constant``, if specified, will be called with one of the
        following strings: -Infinity, Infinity, NaN.
        This can be used to raise an exception if invalid JSON numbers
        are encountered.

        If ``strict`` is false (true is the default), then control
        characters will be allowed inside strings.  Control characters in
        this context are those with character codes in the 0-31 range,
        including ``'\\t'`` (tab), ``'\\n'``, ``'\\r'`` and ``'\\0'``.

        """
        self.object_hook = object_hook
        self.parse_float = parse_float or float
        self.parse_int = parse_int or int
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.strict = strict
        self.object_pairs_hook = object_pairs_hook
        self.parse_object = JSONObject
        self.parse_array = JSONArray
        self.parse_string = scanstring
        self.memo = {}
        # Must come last: make_scanner receives this instance after the
        # attributes above have been assigned.
        self.scan_once = scanner.make_scanner(self)


    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Leading and trailing whitespace is skipped.  Raises ValueError if
        anything other than whitespace follows the document.

        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise ValueError(errmsg("Extra data", s, end, len(s)))
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Decoding starts at index ``idx``.  Raises ValueError if no JSON
        value can be read there.

        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration as err:
            raise ValueError(errmsg("Expecting value", s, err.value)) from None
        return obj, end
Christian Heimes90540002008-05-08 14:29:10 +0000362 return obj, end