blob: 88510a76d903bbd3618733844a84c6742fb474fd [file] [log] [blame]
Brett Cannon4b964f92008-05-05 20:21:38 +00001#include "Python.h"
2
/* Encoding used for byte strings when the caller does not supply one. */
#define DEFAULT_ENCODING "utf-8"
/* True when c is printable ASCII (' ' through '~') other than backslash and
 * double quote, i.e. a character that is copied to the output unescaped.
 * The argument is parenthesized so expression arguments parse correctly;
 * note that it is still evaluated more than once, so it must be free of
 * side effects. */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')
/* Length of one "\uXXXX" escape in output characters. */
#define MIN_EXPANSION 6

/* Largest number of output characters a single input character can produce:
 * on wide builds a character >= 0x10000 is written as two \uXXXX escapes. */
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
12
13static Py_ssize_t
14ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
15{
16 Py_UNICODE x;
17 output[chars++] = '\\';
18 switch (c) {
19 case '\\': output[chars++] = (char)c; break;
20 case '"': output[chars++] = (char)c; break;
21 case '\b': output[chars++] = 'b'; break;
22 case '\f': output[chars++] = 'f'; break;
23 case '\n': output[chars++] = 'n'; break;
24 case '\r': output[chars++] = 'r'; break;
25 case '\t': output[chars++] = 't'; break;
26 default:
27#ifdef Py_UNICODE_WIDE
28 if (c >= 0x10000) {
29 /* UTF-16 surrogate pair */
30 Py_UNICODE v = c - 0x10000;
31 c = 0xd800 | ((v >> 10) & 0x3ff);
32 output[chars++] = 'u';
33 x = (c & 0xf000) >> 12;
34 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
35 x = (c & 0x0f00) >> 8;
36 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
37 x = (c & 0x00f0) >> 4;
38 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
39 x = (c & 0x000f);
40 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
41 c = 0xdc00 | (v & 0x3ff);
42 output[chars++] = '\\';
43 }
44#endif
45 output[chars++] = 'u';
46 x = (c & 0xf000) >> 12;
47 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
48 x = (c & 0x0f00) >> 8;
49 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
50 x = (c & 0x00f0) >> 4;
51 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
52 x = (c & 0x000f);
53 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
54 }
55 return chars;
56}
57
58static PyObject *
59ascii_escape_unicode(PyObject *pystr)
60{
61 Py_ssize_t i;
62 Py_ssize_t input_chars;
63 Py_ssize_t output_size;
64 Py_ssize_t chars;
65 PyObject *rval;
66 char *output;
67 Py_UNICODE *input_unicode;
68
69 input_chars = PyUnicode_GET_SIZE(pystr);
70 input_unicode = PyUnicode_AS_UNICODE(pystr);
71 /* One char input can be up to 6 chars output, estimate 4 of these */
72 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
Gregory P. Smithdd96db62008-06-09 04:58:54 +000073 rval = PyString_FromStringAndSize(NULL, output_size);
Brett Cannon4b964f92008-05-05 20:21:38 +000074 if (rval == NULL) {
75 return NULL;
76 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +000077 output = PyString_AS_STRING(rval);
Brett Cannon4b964f92008-05-05 20:21:38 +000078 chars = 0;
79 output[chars++] = '"';
80 for (i = 0; i < input_chars; i++) {
81 Py_UNICODE c = input_unicode[i];
82 if (S_CHAR(c)) {
83 output[chars++] = (char)c;
84 }
85 else {
86 chars = ascii_escape_char(c, output, chars);
87 }
88 if (output_size - chars < (1 + MAX_EXPANSION)) {
89 /* There's more than four, so let's resize by a lot */
90 output_size *= 2;
91 /* This is an upper bound */
92 if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
93 output_size = 2 + (input_chars * MAX_EXPANSION);
94 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +000095 if (_PyString_Resize(&rval, output_size) == -1) {
Brett Cannon4b964f92008-05-05 20:21:38 +000096 return NULL;
97 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +000098 output = PyString_AS_STRING(rval);
Brett Cannon4b964f92008-05-05 20:21:38 +000099 }
100 }
101 output[chars++] = '"';
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000102 if (_PyString_Resize(&rval, chars) == -1) {
Brett Cannon4b964f92008-05-05 20:21:38 +0000103 return NULL;
104 }
105 return rval;
106}
107
108static PyObject *
109ascii_escape_str(PyObject *pystr)
110{
111 Py_ssize_t i;
112 Py_ssize_t input_chars;
113 Py_ssize_t output_size;
114 Py_ssize_t chars;
115 PyObject *rval;
116 char *output;
117 char *input_str;
118
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000119 input_chars = PyString_GET_SIZE(pystr);
120 input_str = PyString_AS_STRING(pystr);
Brett Cannon4b964f92008-05-05 20:21:38 +0000121 /* One char input can be up to 6 chars output, estimate 4 of these */
122 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000123 rval = PyString_FromStringAndSize(NULL, output_size);
Brett Cannon4b964f92008-05-05 20:21:38 +0000124 if (rval == NULL) {
125 return NULL;
126 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000127 output = PyString_AS_STRING(rval);
Brett Cannon4b964f92008-05-05 20:21:38 +0000128 chars = 0;
129 output[chars++] = '"';
130 for (i = 0; i < input_chars; i++) {
131 Py_UNICODE c = (Py_UNICODE)input_str[i];
132 if (S_CHAR(c)) {
133 output[chars++] = (char)c;
134 }
135 else if (c > 0x7F) {
136 /* We hit a non-ASCII character, bail to unicode mode */
137 PyObject *uni;
138 Py_DECREF(rval);
139 uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
140 if (uni == NULL) {
141 return NULL;
142 }
143 rval = ascii_escape_unicode(uni);
144 Py_DECREF(uni);
145 return rval;
146 }
147 else {
148 chars = ascii_escape_char(c, output, chars);
149 }
150 /* An ASCII char can't possibly expand to a surrogate! */
151 if (output_size - chars < (1 + MIN_EXPANSION)) {
152 /* There's more than four, so let's resize by a lot */
153 output_size *= 2;
154 if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
155 output_size = 2 + (input_chars * MIN_EXPANSION);
156 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000157 if (_PyString_Resize(&rval, output_size) == -1) {
Brett Cannon4b964f92008-05-05 20:21:38 +0000158 return NULL;
159 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000160 output = PyString_AS_STRING(rval);
Brett Cannon4b964f92008-05-05 20:21:38 +0000161 }
162 }
163 output[chars++] = '"';
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000164 if (_PyString_Resize(&rval, chars) == -1) {
Brett Cannon4b964f92008-05-05 20:21:38 +0000165 return NULL;
166 }
167 return rval;
168}
169
170void
171raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
172{
173 static PyObject *errmsg_fn = NULL;
174 PyObject *pymsg;
175 if (errmsg_fn == NULL) {
176 PyObject *decoder = PyImport_ImportModule("json.decoder");
177 if (decoder == NULL)
178 return;
179 errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
180 if (errmsg_fn == NULL)
181 return;
182 Py_XDECREF(decoder);
183 }
184 pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
185 PyErr_SetObject(PyExc_ValueError, pymsg);
186 Py_DECREF(pymsg);
187/*
188
189def linecol(doc, pos):
190 lineno = doc.count('\n', 0, pos) + 1
191 if lineno == 1:
192 colno = pos
193 else:
194 colno = pos - doc.rindex('\n', 0, pos)
195 return lineno, colno
196
197def errmsg(msg, doc, pos, end=None):
198 lineno, colno = linecol(doc, pos)
199 if end is None:
200 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
201 endlineno, endcolno = linecol(doc, end)
202 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
203 msg, lineno, colno, endlineno, endcolno, pos, end)
204
205*/
206}
207
208static PyObject *
209join_list_unicode(PyObject *lst)
210{
211 static PyObject *ustr = NULL;
212 static PyObject *joinstr = NULL;
213 if (ustr == NULL) {
214 Py_UNICODE c = 0;
215 ustr = PyUnicode_FromUnicode(&c, 0);
216 }
217 if (joinstr == NULL) {
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000218 joinstr = PyString_InternFromString("join");
Brett Cannon4b964f92008-05-05 20:21:38 +0000219 }
220 if (joinstr == NULL || ustr == NULL) {
221 return NULL;
222 }
223 return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
224}
225
226static PyObject *
227scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
228{
229 PyObject *rval;
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000230 Py_ssize_t len = PyString_GET_SIZE(pystr);
Brett Cannon4b964f92008-05-05 20:21:38 +0000231 Py_ssize_t begin = end - 1;
232 Py_ssize_t next = begin;
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000233 char *buf = PyString_AS_STRING(pystr);
Brett Cannon4b964f92008-05-05 20:21:38 +0000234 PyObject *chunks = PyList_New(0);
235 if (chunks == NULL) {
236 goto bail;
237 }
Bob Ippolitod648f642008-07-19 21:59:50 +0000238 if (end < 0 || len <= end) {
239 PyErr_SetString(PyExc_ValueError, "end is out of bounds");
240 goto bail;
241 }
Brett Cannon4b964f92008-05-05 20:21:38 +0000242 while (1) {
243 /* Find the end of the string or the next escape */
244 Py_UNICODE c = 0;
245 PyObject *chunk = NULL;
246 for (next = end; next < len; next++) {
247 c = buf[next];
248 if (c == '"' || c == '\\') {
249 break;
250 }
251 else if (strict && c <= 0x1f) {
Bob Ippolitod648f642008-07-19 21:59:50 +0000252 raise_errmsg("Invalid control character at", pystr, next);
Brett Cannon4b964f92008-05-05 20:21:38 +0000253 goto bail;
254 }
255 }
256 if (!(c == '"' || c == '\\')) {
257 raise_errmsg("Unterminated string starting at", pystr, begin);
258 goto bail;
259 }
260 /* Pick up this chunk if it's not zero length */
261 if (next != end) {
262 PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end);
263 if (strchunk == NULL) {
264 goto bail;
265 }
266 chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
267 Py_DECREF(strchunk);
268 if (chunk == NULL) {
269 goto bail;
270 }
271 if (PyList_Append(chunks, chunk)) {
272 goto bail;
273 }
274 Py_DECREF(chunk);
275 }
276 next++;
277 if (c == '"') {
278 end = next;
279 break;
280 }
281 if (next == len) {
282 raise_errmsg("Unterminated string starting at", pystr, begin);
283 goto bail;
284 }
285 c = buf[next];
286 if (c != 'u') {
287 /* Non-unicode backslash escapes */
288 end = next + 1;
289 switch (c) {
290 case '"': break;
291 case '\\': break;
292 case '/': break;
293 case 'b': c = '\b'; break;
294 case 'f': c = '\f'; break;
295 case 'n': c = '\n'; break;
296 case 'r': c = '\r'; break;
297 case 't': c = '\t'; break;
298 default: c = 0;
299 }
300 if (c == 0) {
301 raise_errmsg("Invalid \\escape", pystr, end - 2);
302 goto bail;
303 }
304 }
305 else {
306 c = 0;
307 next++;
308 end = next + 4;
309 if (end >= len) {
310 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
311 goto bail;
312 }
313 /* Decode 4 hex digits */
314 for (; next < end; next++) {
315 Py_ssize_t shl = (end - next - 1) << 2;
316 Py_UNICODE digit = buf[next];
317 switch (digit) {
318 case '0': case '1': case '2': case '3': case '4':
319 case '5': case '6': case '7': case '8': case '9':
320 c |= (digit - '0') << shl; break;
321 case 'a': case 'b': case 'c': case 'd': case 'e':
322 case 'f':
323 c |= (digit - 'a' + 10) << shl; break;
324 case 'A': case 'B': case 'C': case 'D': case 'E':
325 case 'F':
326 c |= (digit - 'A' + 10) << shl; break;
327 default:
328 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
329 goto bail;
330 }
331 }
332#ifdef Py_UNICODE_WIDE
333 /* Surrogate pair */
334 if (c >= 0xd800 && c <= 0xdbff) {
335 Py_UNICODE c2 = 0;
336 if (end + 6 >= len) {
337 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
338 end - 5);
339 }
340 if (buf[next++] != '\\' || buf[next++] != 'u') {
341 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
342 end - 5);
343 }
344 end += 6;
345 /* Decode 4 hex digits */
346 for (; next < end; next++) {
347 Py_ssize_t shl = (end - next - 1) << 2;
348 Py_UNICODE digit = buf[next];
349 switch (digit) {
350 case '0': case '1': case '2': case '3': case '4':
351 case '5': case '6': case '7': case '8': case '9':
352 c2 |= (digit - '0') << shl; break;
353 case 'a': case 'b': case 'c': case 'd': case 'e':
354 case 'f':
355 c2 |= (digit - 'a' + 10) << shl; break;
356 case 'A': case 'B': case 'C': case 'D': case 'E':
357 case 'F':
358 c2 |= (digit - 'A' + 10) << shl; break;
359 default:
360 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
361 goto bail;
362 }
363 }
364 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
365 }
366#endif
367 }
368 chunk = PyUnicode_FromUnicode(&c, 1);
369 if (chunk == NULL) {
370 goto bail;
371 }
372 if (PyList_Append(chunks, chunk)) {
373 goto bail;
374 }
375 Py_DECREF(chunk);
376 }
377
378 rval = join_list_unicode(chunks);
379 if (rval == NULL) {
380 goto bail;
381 }
382 Py_DECREF(chunks);
383 chunks = NULL;
384 return Py_BuildValue("(Nn)", rval, end);
385bail:
386 Py_XDECREF(chunks);
387 return NULL;
388}
389
390
391static PyObject *
392scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
393{
394 PyObject *rval;
395 Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
396 Py_ssize_t begin = end - 1;
397 Py_ssize_t next = begin;
398 const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
399 PyObject *chunks = PyList_New(0);
400 if (chunks == NULL) {
401 goto bail;
402 }
Bob Ippolitod648f642008-07-19 21:59:50 +0000403 if (end < 0 || len <= end) {
404 PyErr_SetString(PyExc_ValueError, "end is out of bounds");
405 goto bail;
406 }
Brett Cannon4b964f92008-05-05 20:21:38 +0000407 while (1) {
408 /* Find the end of the string or the next escape */
409 Py_UNICODE c = 0;
410 PyObject *chunk = NULL;
411 for (next = end; next < len; next++) {
412 c = buf[next];
413 if (c == '"' || c == '\\') {
414 break;
415 }
416 else if (strict && c <= 0x1f) {
Bob Ippolitod648f642008-07-19 21:59:50 +0000417 raise_errmsg("Invalid control character at", pystr, next);
Brett Cannon4b964f92008-05-05 20:21:38 +0000418 goto bail;
419 }
420 }
421 if (!(c == '"' || c == '\\')) {
422 raise_errmsg("Unterminated string starting at", pystr, begin);
423 goto bail;
424 }
425 /* Pick up this chunk if it's not zero length */
426 if (next != end) {
427 chunk = PyUnicode_FromUnicode(&buf[end], next - end);
428 if (chunk == NULL) {
429 goto bail;
430 }
431 if (PyList_Append(chunks, chunk)) {
432 goto bail;
433 }
434 Py_DECREF(chunk);
435 }
436 next++;
437 if (c == '"') {
438 end = next;
439 break;
440 }
441 if (next == len) {
442 raise_errmsg("Unterminated string starting at", pystr, begin);
443 goto bail;
444 }
445 c = buf[next];
446 if (c != 'u') {
447 /* Non-unicode backslash escapes */
448 end = next + 1;
449 switch (c) {
450 case '"': break;
451 case '\\': break;
452 case '/': break;
453 case 'b': c = '\b'; break;
454 case 'f': c = '\f'; break;
455 case 'n': c = '\n'; break;
456 case 'r': c = '\r'; break;
457 case 't': c = '\t'; break;
458 default: c = 0;
459 }
460 if (c == 0) {
461 raise_errmsg("Invalid \\escape", pystr, end - 2);
462 goto bail;
463 }
464 }
465 else {
466 c = 0;
467 next++;
468 end = next + 4;
469 if (end >= len) {
470 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
471 goto bail;
472 }
473 /* Decode 4 hex digits */
474 for (; next < end; next++) {
475 Py_ssize_t shl = (end - next - 1) << 2;
476 Py_UNICODE digit = buf[next];
477 switch (digit) {
478 case '0': case '1': case '2': case '3': case '4':
479 case '5': case '6': case '7': case '8': case '9':
480 c |= (digit - '0') << shl; break;
481 case 'a': case 'b': case 'c': case 'd': case 'e':
482 case 'f':
483 c |= (digit - 'a' + 10) << shl; break;
484 case 'A': case 'B': case 'C': case 'D': case 'E':
485 case 'F':
486 c |= (digit - 'A' + 10) << shl; break;
487 default:
488 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
489 goto bail;
490 }
491 }
492#ifdef Py_UNICODE_WIDE
493 /* Surrogate pair */
494 if (c >= 0xd800 && c <= 0xdbff) {
495 Py_UNICODE c2 = 0;
496 if (end + 6 >= len) {
497 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
498 end - 5);
499 }
500 if (buf[next++] != '\\' || buf[next++] != 'u') {
501 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
502 end - 5);
503 }
504 end += 6;
505 /* Decode 4 hex digits */
506 for (; next < end; next++) {
507 Py_ssize_t shl = (end - next - 1) << 2;
508 Py_UNICODE digit = buf[next];
509 switch (digit) {
510 case '0': case '1': case '2': case '3': case '4':
511 case '5': case '6': case '7': case '8': case '9':
512 c2 |= (digit - '0') << shl; break;
513 case 'a': case 'b': case 'c': case 'd': case 'e':
514 case 'f':
515 c2 |= (digit - 'a' + 10) << shl; break;
516 case 'A': case 'B': case 'C': case 'D': case 'E':
517 case 'F':
518 c2 |= (digit - 'A' + 10) << shl; break;
519 default:
520 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
521 goto bail;
522 }
523 }
524 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
525 }
526#endif
527 }
528 chunk = PyUnicode_FromUnicode(&c, 1);
529 if (chunk == NULL) {
530 goto bail;
531 }
532 if (PyList_Append(chunks, chunk)) {
533 goto bail;
534 }
535 Py_DECREF(chunk);
536 }
537
538 rval = join_list_unicode(chunks);
539 if (rval == NULL) {
540 goto bail;
541 }
542 Py_DECREF(chunks);
543 chunks = NULL;
544 return Py_BuildValue("(Nn)", rval, end);
545bail:
546 Py_XDECREF(chunks);
547 return NULL;
548}
549
550PyDoc_STRVAR(pydoc_scanstring,
551"scanstring(basestring, end, encoding) -> (str, end)\n");
552
553static PyObject *
554py_scanstring(PyObject* self, PyObject *args)
555{
556 PyObject *pystr;
557 Py_ssize_t end;
558 char *encoding = NULL;
559 int strict = 0;
560 if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
561 return NULL;
562 }
563 if (encoding == NULL) {
564 encoding = DEFAULT_ENCODING;
565 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000566 if (PyString_Check(pystr)) {
Brett Cannon4b964f92008-05-05 20:21:38 +0000567 return scanstring_str(pystr, end, encoding, strict);
568 }
569 else if (PyUnicode_Check(pystr)) {
570 return scanstring_unicode(pystr, end, strict);
571 }
572 else {
573 PyErr_Format(PyExc_TypeError,
574 "first argument must be a string or unicode, not %.80s",
575 Py_TYPE(pystr)->tp_name);
576 return NULL;
577 }
578}
579
580PyDoc_STRVAR(pydoc_encode_basestring_ascii,
581"encode_basestring_ascii(basestring) -> str\n");
582
583static PyObject *
584py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
585{
586 /* METH_O */
Gregory P. Smithdd96db62008-06-09 04:58:54 +0000587 if (PyString_Check(pystr)) {
Brett Cannon4b964f92008-05-05 20:21:38 +0000588 return ascii_escape_str(pystr);
589 }
590 else if (PyUnicode_Check(pystr)) {
591 return ascii_escape_unicode(pystr);
592 }
593 else {
594 PyErr_Format(PyExc_TypeError,
595 "first argument must be a string or unicode, not %.80s",
596 Py_TYPE(pystr)->tp_name);
597 return NULL;
598 }
599}
600
601static PyMethodDef json_methods[] = {
602 {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
603 METH_O, pydoc_encode_basestring_ascii},
604 {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
605 pydoc_scanstring},
606 {NULL, NULL, 0, NULL}
607};
608
609PyDoc_STRVAR(module_doc,
610"json speedups\n");
611
612void
613init_json(void)
614{
615 PyObject *m;
616 m = Py_InitModule3("_json", json_methods, module_doc);
617}