blob: a4308fdc7ebaff36cba282671ca8da2e2a425c28 [file] [log] [blame]
Christian Heimes90540002008-05-08 14:29:10 +00001#include "Python.h"
2
/* Encoding assumed for byte-string input when the caller supplies none. */
#define DEFAULT_ENCODING "utf-8"

/* True when c is a printable ASCII character that may be copied into a JSON
   string unchanged: anything from ' ' through '~' except backslash and the
   double quote.  The argument is fully parenthesized so that expression
   arguments are parsed as a unit; it is still evaluated more than once, so
   it must not have side effects. */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

/* Length of one "\uXXXX" escape sequence. */
#define MIN_EXPANSION 6

/* Worst-case number of output characters produced by one input character.
   On wide-unicode builds a single character above 0xFFFF is written as a
   surrogate pair, i.e. two "\uXXXX" escapes. */
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
12
13static Py_ssize_t
14ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars)
15{
16 Py_UNICODE x;
17 output[chars++] = '\\';
18 switch (c) {
19 case '\\': output[chars++] = (char)c; break;
20 case '"': output[chars++] = (char)c; break;
21 case '\b': output[chars++] = 'b'; break;
22 case '\f': output[chars++] = 'f'; break;
23 case '\n': output[chars++] = 'n'; break;
24 case '\r': output[chars++] = 'r'; break;
25 case '\t': output[chars++] = 't'; break;
26 default:
27#ifdef Py_UNICODE_WIDE
28 if (c >= 0x10000) {
29 /* UTF-16 surrogate pair */
30 Py_UNICODE v = c - 0x10000;
31 c = 0xd800 | ((v >> 10) & 0x3ff);
32 output[chars++] = 'u';
33 x = (c & 0xf000) >> 12;
34 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
35 x = (c & 0x0f00) >> 8;
36 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
37 x = (c & 0x00f0) >> 4;
38 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
39 x = (c & 0x000f);
40 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
41 c = 0xdc00 | (v & 0x3ff);
42 output[chars++] = '\\';
43 }
44#endif
45 output[chars++] = 'u';
46 x = (c & 0xf000) >> 12;
47 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
48 x = (c & 0x0f00) >> 8;
49 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
50 x = (c & 0x00f0) >> 4;
51 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
52 x = (c & 0x000f);
53 output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10);
54 }
55 return chars;
56}
57
58static PyObject *
59ascii_escape_unicode(PyObject *pystr)
60{
61 Py_ssize_t i;
62 Py_ssize_t input_chars;
63 Py_ssize_t output_size;
64 Py_ssize_t chars;
65 PyObject *rval;
66 char *output;
67 Py_UNICODE *input_unicode;
68
69 input_chars = PyUnicode_GET_SIZE(pystr);
70 input_unicode = PyUnicode_AS_UNICODE(pystr);
71 /* One char input can be up to 6 chars output, estimate 4 of these */
72 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
Christian Heimes72b710a2008-05-26 13:28:38 +000073 rval = PyBytes_FromStringAndSize(NULL, output_size);
Christian Heimes90540002008-05-08 14:29:10 +000074 if (rval == NULL) {
75 return NULL;
76 }
Christian Heimes72b710a2008-05-26 13:28:38 +000077 output = PyBytes_AS_STRING(rval);
Christian Heimes90540002008-05-08 14:29:10 +000078 chars = 0;
79 output[chars++] = '"';
80 for (i = 0; i < input_chars; i++) {
81 Py_UNICODE c = input_unicode[i];
82 if (S_CHAR(c)) {
83 output[chars++] = (char)c;
84 }
85 else {
86 chars = ascii_escape_char(c, output, chars);
87 }
88 if (output_size - chars < (1 + MAX_EXPANSION)) {
89 /* There's more than four, so let's resize by a lot */
90 output_size *= 2;
91 /* This is an upper bound */
92 if (output_size > 2 + (input_chars * MAX_EXPANSION)) {
93 output_size = 2 + (input_chars * MAX_EXPANSION);
94 }
Christian Heimes72b710a2008-05-26 13:28:38 +000095 if (_PyBytes_Resize(&rval, output_size) == -1) {
Christian Heimes90540002008-05-08 14:29:10 +000096 return NULL;
97 }
Christian Heimes72b710a2008-05-26 13:28:38 +000098 output = PyBytes_AS_STRING(rval);
Christian Heimes90540002008-05-08 14:29:10 +000099 }
100 }
101 output[chars++] = '"';
Christian Heimes72b710a2008-05-26 13:28:38 +0000102 if (_PyBytes_Resize(&rval, chars) == -1) {
Christian Heimes90540002008-05-08 14:29:10 +0000103 return NULL;
104 }
105 return rval;
106}
107
108static PyObject *
109ascii_escape_str(PyObject *pystr)
110{
111 Py_ssize_t i;
112 Py_ssize_t input_chars;
113 Py_ssize_t output_size;
114 Py_ssize_t chars;
115 PyObject *rval;
116 char *output;
117 char *input_str;
118
Christian Heimes72b710a2008-05-26 13:28:38 +0000119 input_chars = PyBytes_GET_SIZE(pystr);
120 input_str = PyBytes_AS_STRING(pystr);
Christian Heimes90540002008-05-08 14:29:10 +0000121 /* One char input can be up to 6 chars output, estimate 4 of these */
122 output_size = 2 + (MIN_EXPANSION * 4) + input_chars;
Christian Heimes72b710a2008-05-26 13:28:38 +0000123 rval = PyBytes_FromStringAndSize(NULL, output_size);
Christian Heimes90540002008-05-08 14:29:10 +0000124 if (rval == NULL) {
125 return NULL;
126 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000127 output = PyBytes_AS_STRING(rval);
Christian Heimes90540002008-05-08 14:29:10 +0000128 chars = 0;
129 output[chars++] = '"';
130 for (i = 0; i < input_chars; i++) {
131 Py_UNICODE c = (Py_UNICODE)input_str[i];
132 if (S_CHAR(c)) {
133 output[chars++] = (char)c;
134 }
135 else if (c > 0x7F) {
136 /* We hit a non-ASCII character, bail to unicode mode */
137 PyObject *uni;
138 Py_DECREF(rval);
139 uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict");
140 if (uni == NULL) {
141 return NULL;
142 }
143 rval = ascii_escape_unicode(uni);
144 Py_DECREF(uni);
145 return rval;
146 }
147 else {
148 chars = ascii_escape_char(c, output, chars);
149 }
150 /* An ASCII char can't possibly expand to a surrogate! */
151 if (output_size - chars < (1 + MIN_EXPANSION)) {
152 /* There's more than four, so let's resize by a lot */
153 output_size *= 2;
154 if (output_size > 2 + (input_chars * MIN_EXPANSION)) {
155 output_size = 2 + (input_chars * MIN_EXPANSION);
156 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000157 if (_PyBytes_Resize(&rval, output_size) == -1) {
Christian Heimes90540002008-05-08 14:29:10 +0000158 return NULL;
159 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000160 output = PyBytes_AS_STRING(rval);
Christian Heimes90540002008-05-08 14:29:10 +0000161 }
162 }
163 output[chars++] = '"';
Christian Heimes72b710a2008-05-26 13:28:38 +0000164 if (_PyBytes_Resize(&rval, chars) == -1) {
Christian Heimes90540002008-05-08 14:29:10 +0000165 return NULL;
166 }
167 return rval;
168}
169
170void
171raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
172{
173 static PyObject *errmsg_fn = NULL;
174 PyObject *pymsg;
175 if (errmsg_fn == NULL) {
176 PyObject *decoder = PyImport_ImportModule("json.decoder");
177 if (decoder == NULL)
178 return;
179 errmsg_fn = PyObject_GetAttrString(decoder, "errmsg");
180 if (errmsg_fn == NULL)
181 return;
182 Py_XDECREF(decoder);
183 }
184 pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end);
185 PyErr_SetObject(PyExc_ValueError, pymsg);
186 Py_DECREF(pymsg);
187/*
188
189def linecol(doc, pos):
190 lineno = doc.count('\n', 0, pos) + 1
191 if lineno == 1:
192 colno = pos
193 else:
194 colno = pos - doc.rindex('\n', 0, pos)
195 return lineno, colno
196
197def errmsg(msg, doc, pos, end=None):
198 lineno, colno = linecol(doc, pos)
199 if end is None:
200 return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos)
201 endlineno, endcolno = linecol(doc, end)
202 return '%s: line %d column %d - line %d column %d (char %d - %d)' % (
203 msg, lineno, colno, endlineno, endcolno, pos, end)
204
205*/
206}
207
208static PyObject *
209join_list_unicode(PyObject *lst)
210{
211 static PyObject *ustr = NULL;
212 static PyObject *joinstr = NULL;
213 if (ustr == NULL) {
214 Py_UNICODE c = 0;
215 ustr = PyUnicode_FromUnicode(&c, 0);
216 }
217 if (joinstr == NULL) {
218 joinstr = PyUnicode_InternFromString("join");
219 }
220 if (joinstr == NULL || ustr == NULL) {
221 return NULL;
222 }
223 return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL);
224}
225
226static PyObject *
227scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict)
228{
229 PyObject *rval;
Christian Heimes72b710a2008-05-26 13:28:38 +0000230 Py_ssize_t len = PyBytes_GET_SIZE(pystr);
Christian Heimes90540002008-05-08 14:29:10 +0000231 Py_ssize_t begin = end - 1;
232 Py_ssize_t next = begin;
Christian Heimes72b710a2008-05-26 13:28:38 +0000233 char *buf = PyBytes_AS_STRING(pystr);
Christian Heimes90540002008-05-08 14:29:10 +0000234 Py_buffer info;
235 PyObject *chunks = PyList_New(0);
236 if (chunks == NULL) {
237 goto bail;
238 }
239 while (1) {
240 /* Find the end of the string or the next escape */
241 Py_UNICODE c = 0;
242 PyObject *chunk = NULL;
243 for (next = end; next < len; next++) {
244 c = buf[next];
245 if (c == '"' || c == '\\') {
246 break;
247 }
248 else if (strict && c <= 0x1f) {
249 raise_errmsg("Invalid control character at", pystr, begin);
250 goto bail;
251 }
252 }
253 if (!(c == '"' || c == '\\')) {
254 raise_errmsg("Unterminated string starting at", pystr, begin);
255 goto bail;
256 }
257 /* Pick up this chunk if it's not zero length */
258 if (next != end) {
Amaury Forgeot d'Arccb0cdce2008-05-08 20:56:43 +0000259 PyObject *strchunk;
Christian Heimes90540002008-05-08 14:29:10 +0000260 if (PyBuffer_FillInfo(&info, &buf[end], next - end, 1, 0) < 0) {
261 goto bail;
262 }
Amaury Forgeot d'Arccb0cdce2008-05-08 20:56:43 +0000263 strchunk = PyMemoryView_FromMemory(&info);
Christian Heimes90540002008-05-08 14:29:10 +0000264 if (strchunk == NULL) {
265 goto bail;
266 }
267 chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL);
268 Py_DECREF(strchunk);
269 if (chunk == NULL) {
270 goto bail;
271 }
272 if (PyList_Append(chunks, chunk)) {
273 goto bail;
274 }
275 Py_DECREF(chunk);
276 }
277 next++;
278 if (c == '"') {
279 end = next;
280 break;
281 }
282 if (next == len) {
283 raise_errmsg("Unterminated string starting at", pystr, begin);
284 goto bail;
285 }
286 c = buf[next];
287 if (c != 'u') {
288 /* Non-unicode backslash escapes */
289 end = next + 1;
290 switch (c) {
291 case '"': break;
292 case '\\': break;
293 case '/': break;
294 case 'b': c = '\b'; break;
295 case 'f': c = '\f'; break;
296 case 'n': c = '\n'; break;
297 case 'r': c = '\r'; break;
298 case 't': c = '\t'; break;
299 default: c = 0;
300 }
301 if (c == 0) {
302 raise_errmsg("Invalid \\escape", pystr, end - 2);
303 goto bail;
304 }
305 }
306 else {
307 c = 0;
308 next++;
309 end = next + 4;
310 if (end >= len) {
311 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
312 goto bail;
313 }
314 /* Decode 4 hex digits */
315 for (; next < end; next++) {
316 Py_ssize_t shl = (end - next - 1) << 2;
317 Py_UNICODE digit = buf[next];
318 switch (digit) {
319 case '0': case '1': case '2': case '3': case '4':
320 case '5': case '6': case '7': case '8': case '9':
321 c |= (digit - '0') << shl; break;
322 case 'a': case 'b': case 'c': case 'd': case 'e':
323 case 'f':
324 c |= (digit - 'a' + 10) << shl; break;
325 case 'A': case 'B': case 'C': case 'D': case 'E':
326 case 'F':
327 c |= (digit - 'A' + 10) << shl; break;
328 default:
329 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
330 goto bail;
331 }
332 }
333#ifdef Py_UNICODE_WIDE
334 /* Surrogate pair */
335 if (c >= 0xd800 && c <= 0xdbff) {
336 Py_UNICODE c2 = 0;
337 if (end + 6 >= len) {
338 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
339 end - 5);
340 }
341 if (buf[next++] != '\\' || buf[next++] != 'u') {
342 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
343 end - 5);
344 }
345 end += 6;
346 /* Decode 4 hex digits */
347 for (; next < end; next++) {
348 Py_ssize_t shl = (end - next - 1) << 2;
349 Py_UNICODE digit = buf[next];
350 switch (digit) {
351 case '0': case '1': case '2': case '3': case '4':
352 case '5': case '6': case '7': case '8': case '9':
353 c2 |= (digit - '0') << shl; break;
354 case 'a': case 'b': case 'c': case 'd': case 'e':
355 case 'f':
356 c2 |= (digit - 'a' + 10) << shl; break;
357 case 'A': case 'B': case 'C': case 'D': case 'E':
358 case 'F':
359 c2 |= (digit - 'A' + 10) << shl; break;
360 default:
361 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
362 goto bail;
363 }
364 }
365 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
366 }
367#endif
368 }
369 chunk = PyUnicode_FromUnicode(&c, 1);
370 if (chunk == NULL) {
371 goto bail;
372 }
373 if (PyList_Append(chunks, chunk)) {
374 goto bail;
375 }
376 Py_DECREF(chunk);
377 }
378
379 rval = join_list_unicode(chunks);
380 if (rval == NULL) {
381 goto bail;
382 }
383 Py_DECREF(chunks);
384 chunks = NULL;
385 return Py_BuildValue("(Nn)", rval, end);
386bail:
387 Py_XDECREF(chunks);
388 return NULL;
389}
390
391
392static PyObject *
393scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict)
394{
395 PyObject *rval;
396 Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
397 Py_ssize_t begin = end - 1;
398 Py_ssize_t next = begin;
399 const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
400 PyObject *chunks = PyList_New(0);
401 if (chunks == NULL) {
402 goto bail;
403 }
404 while (1) {
405 /* Find the end of the string or the next escape */
406 Py_UNICODE c = 0;
407 PyObject *chunk = NULL;
408 for (next = end; next < len; next++) {
409 c = buf[next];
410 if (c == '"' || c == '\\') {
411 break;
412 }
413 else if (strict && c <= 0x1f) {
414 raise_errmsg("Invalid control character at", pystr, begin);
415 goto bail;
416 }
417 }
418 if (!(c == '"' || c == '\\')) {
419 raise_errmsg("Unterminated string starting at", pystr, begin);
420 goto bail;
421 }
422 /* Pick up this chunk if it's not zero length */
423 if (next != end) {
424 chunk = PyUnicode_FromUnicode(&buf[end], next - end);
425 if (chunk == NULL) {
426 goto bail;
427 }
428 if (PyList_Append(chunks, chunk)) {
429 goto bail;
430 }
431 Py_DECREF(chunk);
432 }
433 next++;
434 if (c == '"') {
435 end = next;
436 break;
437 }
438 if (next == len) {
439 raise_errmsg("Unterminated string starting at", pystr, begin);
440 goto bail;
441 }
442 c = buf[next];
443 if (c != 'u') {
444 /* Non-unicode backslash escapes */
445 end = next + 1;
446 switch (c) {
447 case '"': break;
448 case '\\': break;
449 case '/': break;
450 case 'b': c = '\b'; break;
451 case 'f': c = '\f'; break;
452 case 'n': c = '\n'; break;
453 case 'r': c = '\r'; break;
454 case 't': c = '\t'; break;
455 default: c = 0;
456 }
457 if (c == 0) {
458 raise_errmsg("Invalid \\escape", pystr, end - 2);
459 goto bail;
460 }
461 }
462 else {
463 c = 0;
464 next++;
465 end = next + 4;
466 if (end >= len) {
467 raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1);
468 goto bail;
469 }
470 /* Decode 4 hex digits */
471 for (; next < end; next++) {
472 Py_ssize_t shl = (end - next - 1) << 2;
473 Py_UNICODE digit = buf[next];
474 switch (digit) {
475 case '0': case '1': case '2': case '3': case '4':
476 case '5': case '6': case '7': case '8': case '9':
477 c |= (digit - '0') << shl; break;
478 case 'a': case 'b': case 'c': case 'd': case 'e':
479 case 'f':
480 c |= (digit - 'a' + 10) << shl; break;
481 case 'A': case 'B': case 'C': case 'D': case 'E':
482 case 'F':
483 c |= (digit - 'A' + 10) << shl; break;
484 default:
485 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
486 goto bail;
487 }
488 }
489#ifdef Py_UNICODE_WIDE
490 /* Surrogate pair */
491 if (c >= 0xd800 && c <= 0xdbff) {
492 Py_UNICODE c2 = 0;
493 if (end + 6 >= len) {
494 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
495 end - 5);
496 }
497 if (buf[next++] != '\\' || buf[next++] != 'u') {
498 raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr,
499 end - 5);
500 }
501 end += 6;
502 /* Decode 4 hex digits */
503 for (; next < end; next++) {
504 Py_ssize_t shl = (end - next - 1) << 2;
505 Py_UNICODE digit = buf[next];
506 switch (digit) {
507 case '0': case '1': case '2': case '3': case '4':
508 case '5': case '6': case '7': case '8': case '9':
509 c2 |= (digit - '0') << shl; break;
510 case 'a': case 'b': case 'c': case 'd': case 'e':
511 case 'f':
512 c2 |= (digit - 'a' + 10) << shl; break;
513 case 'A': case 'B': case 'C': case 'D': case 'E':
514 case 'F':
515 c2 |= (digit - 'A' + 10) << shl; break;
516 default:
517 raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5);
518 goto bail;
519 }
520 }
521 c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
522 }
523#endif
524 }
525 chunk = PyUnicode_FromUnicode(&c, 1);
526 if (chunk == NULL) {
527 goto bail;
528 }
529 if (PyList_Append(chunks, chunk)) {
530 goto bail;
531 }
532 Py_DECREF(chunk);
533 }
534
535 rval = join_list_unicode(chunks);
536 if (rval == NULL) {
537 goto bail;
538 }
539 Py_DECREF(chunks);
540 chunks = NULL;
541 return Py_BuildValue("(Nn)", rval, end);
542bail:
543 Py_XDECREF(chunks);
544 return NULL;
545}
546
547PyDoc_STRVAR(pydoc_scanstring,
548"scanstring(str_or_bytes, end, encoding) -> (bytes, end)\n");
549
550static PyObject *
551py_scanstring(PyObject* self, PyObject *args)
552{
553 PyObject *pystr;
554 Py_ssize_t end;
555 char *encoding = NULL;
556 int strict = 0;
557 if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) {
558 return NULL;
559 }
560 if (encoding == NULL) {
561 encoding = DEFAULT_ENCODING;
562 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000563 if (PyBytes_Check(pystr)) {
Christian Heimes90540002008-05-08 14:29:10 +0000564 return scanstring_str(pystr, end, encoding, strict);
565 }
566 else if (PyUnicode_Check(pystr)) {
567 return scanstring_unicode(pystr, end, strict);
568 }
569 else {
570 PyErr_Format(PyExc_TypeError,
571 "first argument must be a string or bytes, not %.80s",
572 Py_TYPE(pystr)->tp_name);
573 return NULL;
574 }
575}
576
577PyDoc_STRVAR(pydoc_encode_basestring_ascii,
578"encode_basestring_ascii(str_or_bytes) -> bytes\n");
579
580static PyObject *
581py_encode_basestring_ascii(PyObject* self, PyObject *pystr)
582{
583 PyObject *rval;
584 /* METH_O */
Christian Heimes72b710a2008-05-26 13:28:38 +0000585 if (PyBytes_Check(pystr)) {
Christian Heimes90540002008-05-08 14:29:10 +0000586 rval = ascii_escape_str(pystr);
587 }
588 else if (PyUnicode_Check(pystr)) {
589 rval = ascii_escape_unicode(pystr);
590 }
591 else {
592 PyErr_Format(PyExc_TypeError,
593 "first argument must be a string or unicode, not %.80s",
594 Py_TYPE(pystr)->tp_name);
595 return NULL;
596 }
Christian Heimes72b710a2008-05-26 13:28:38 +0000597 if (PyBytes_Check(rval)) {
598 PyObject *urval = PyUnicode_DecodeASCII(PyBytes_AS_STRING(rval), PyBytes_GET_SIZE(rval), NULL);
Christian Heimes90540002008-05-08 14:29:10 +0000599 Py_DECREF(rval);
600 return urval;
601 }
602 return rval;
603}
604
605static PyMethodDef json_methods[] = {
606 {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii,
607 METH_O, pydoc_encode_basestring_ascii},
608 {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS,
609 pydoc_scanstring},
610 {NULL, NULL, 0, NULL}
611};
612
613PyDoc_STRVAR(module_doc,
614"json speedups\n");
615
Martin v. Löwis1a214512008-06-11 05:26:20 +0000616static struct PyModuleDef jsonmodule = {
617 PyModuleDef_HEAD_INIT,
618 "_json",
619 module_doc,
620 -1,
621 json_methods,
622 NULL,
623 NULL,
624 NULL,
625 NULL
626};
627
628PyObject*
629PyInit__json(void)
Christian Heimes90540002008-05-08 14:29:10 +0000630{
Martin v. Löwis1a214512008-06-11 05:26:20 +0000631 return PyModule_Create(&jsonmodule);
Christian Heimes90540002008-05-08 14:29:10 +0000632}