| #include "Python.h" |
| |
/* Encoding assumed for bytes input when the caller does not name one. */
#define DEFAULT_ENCODING "utf-8"

/*
 * True when c may be copied to the JSON output verbatim: printable ASCII
 * (' ' through '~') other than backslash and double quote.
 * The argument is parenthesised so compound expressions expand correctly;
 * it is still evaluated more than once, so avoid side effects in it.
 */
#define S_CHAR(c) ((c) >= ' ' && (c) <= '~' && (c) != '\\' && (c) != '"')

/* Output bytes needed for one "\uXXXX" escape. */
#define MIN_EXPANSION 6

/*
 * Worst-case output bytes for a single input character.  On wide builds a
 * character at or above 0x10000 is written as two "\uXXXX" escapes.
 */
#ifdef Py_UNICODE_WIDE
#define MAX_EXPANSION (2 * MIN_EXPANSION)
#else
#define MAX_EXPANSION MIN_EXPANSION
#endif
| |
| static Py_ssize_t |
| ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) |
| { |
| Py_UNICODE x; |
| output[chars++] = '\\'; |
| switch (c) { |
| case '\\': output[chars++] = (char)c; break; |
| case '"': output[chars++] = (char)c; break; |
| case '\b': output[chars++] = 'b'; break; |
| case '\f': output[chars++] = 'f'; break; |
| case '\n': output[chars++] = 'n'; break; |
| case '\r': output[chars++] = 'r'; break; |
| case '\t': output[chars++] = 't'; break; |
| default: |
| #ifdef Py_UNICODE_WIDE |
| if (c >= 0x10000) { |
| /* UTF-16 surrogate pair */ |
| Py_UNICODE v = c - 0x10000; |
| c = 0xd800 | ((v >> 10) & 0x3ff); |
| output[chars++] = 'u'; |
| x = (c & 0xf000) >> 12; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x0f00) >> 8; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x00f0) >> 4; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x000f); |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| c = 0xdc00 | (v & 0x3ff); |
| output[chars++] = '\\'; |
| } |
| #endif |
| output[chars++] = 'u'; |
| x = (c & 0xf000) >> 12; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x0f00) >> 8; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x00f0) >> 4; |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| x = (c & 0x000f); |
| output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); |
| } |
| return chars; |
| } |
| |
| static PyObject * |
| ascii_escape_unicode(PyObject *pystr) |
| { |
| Py_ssize_t i; |
| Py_ssize_t input_chars; |
| Py_ssize_t output_size; |
| Py_ssize_t chars; |
| PyObject *rval; |
| char *output; |
| Py_UNICODE *input_unicode; |
| |
| input_chars = PyUnicode_GET_SIZE(pystr); |
| input_unicode = PyUnicode_AS_UNICODE(pystr); |
| /* One char input can be up to 6 chars output, estimate 4 of these */ |
| output_size = 2 + (MIN_EXPANSION * 4) + input_chars; |
| rval = PyBytes_FromStringAndSize(NULL, output_size); |
| if (rval == NULL) { |
| return NULL; |
| } |
| output = PyBytes_AS_STRING(rval); |
| chars = 0; |
| output[chars++] = '"'; |
| for (i = 0; i < input_chars; i++) { |
| Py_UNICODE c = input_unicode[i]; |
| if (S_CHAR(c)) { |
| output[chars++] = (char)c; |
| } |
| else { |
| chars = ascii_escape_char(c, output, chars); |
| } |
| if (output_size - chars < (1 + MAX_EXPANSION)) { |
| /* There's more than four, so let's resize by a lot */ |
| output_size *= 2; |
| /* This is an upper bound */ |
| if (output_size > 2 + (input_chars * MAX_EXPANSION)) { |
| output_size = 2 + (input_chars * MAX_EXPANSION); |
| } |
| if (_PyBytes_Resize(&rval, output_size) == -1) { |
| return NULL; |
| } |
| output = PyBytes_AS_STRING(rval); |
| } |
| } |
| output[chars++] = '"'; |
| if (_PyBytes_Resize(&rval, chars) == -1) { |
| return NULL; |
| } |
| return rval; |
| } |
| |
| static PyObject * |
| ascii_escape_str(PyObject *pystr) |
| { |
| Py_ssize_t i; |
| Py_ssize_t input_chars; |
| Py_ssize_t output_size; |
| Py_ssize_t chars; |
| PyObject *rval; |
| char *output; |
| char *input_str; |
| |
| input_chars = PyBytes_GET_SIZE(pystr); |
| input_str = PyBytes_AS_STRING(pystr); |
| /* One char input can be up to 6 chars output, estimate 4 of these */ |
| output_size = 2 + (MIN_EXPANSION * 4) + input_chars; |
| rval = PyBytes_FromStringAndSize(NULL, output_size); |
| if (rval == NULL) { |
| return NULL; |
| } |
| output = PyBytes_AS_STRING(rval); |
| chars = 0; |
| output[chars++] = '"'; |
| for (i = 0; i < input_chars; i++) { |
| Py_UNICODE c = (Py_UNICODE)input_str[i]; |
| if (S_CHAR(c)) { |
| output[chars++] = (char)c; |
| } |
| else if (c > 0x7F) { |
| /* We hit a non-ASCII character, bail to unicode mode */ |
| PyObject *uni; |
| Py_DECREF(rval); |
| uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); |
| if (uni == NULL) { |
| return NULL; |
| } |
| rval = ascii_escape_unicode(uni); |
| Py_DECREF(uni); |
| return rval; |
| } |
| else { |
| chars = ascii_escape_char(c, output, chars); |
| } |
| /* An ASCII char can't possibly expand to a surrogate! */ |
| if (output_size - chars < (1 + MIN_EXPANSION)) { |
| /* There's more than four, so let's resize by a lot */ |
| output_size *= 2; |
| if (output_size > 2 + (input_chars * MIN_EXPANSION)) { |
| output_size = 2 + (input_chars * MIN_EXPANSION); |
| } |
| if (_PyBytes_Resize(&rval, output_size) == -1) { |
| return NULL; |
| } |
| output = PyBytes_AS_STRING(rval); |
| } |
| } |
| output[chars++] = '"'; |
| if (_PyBytes_Resize(&rval, chars) == -1) { |
| return NULL; |
| } |
| return rval; |
| } |
| |
| void |
| raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) |
| { |
| static PyObject *errmsg_fn = NULL; |
| PyObject *pymsg; |
| if (errmsg_fn == NULL) { |
| PyObject *decoder = PyImport_ImportModule("json.decoder"); |
| if (decoder == NULL) |
| return; |
| errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); |
| if (errmsg_fn == NULL) |
| return; |
| Py_DECREF(decoder); |
| } |
| pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end); |
| if (pymsg) { |
| PyErr_SetObject(PyExc_ValueError, pymsg); |
| Py_DECREF(pymsg); |
| } |
| /* |
| |
| def linecol(doc, pos): |
| lineno = doc.count('\n', 0, pos) + 1 |
| if lineno == 1: |
| colno = pos |
| else: |
| colno = pos - doc.rindex('\n', 0, pos) |
| return lineno, colno |
| |
| def errmsg(msg, doc, pos, end=None): |
| lineno, colno = linecol(doc, pos) |
| if end is None: |
| return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) |
| endlineno, endcolno = linecol(doc, end) |
| return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( |
| msg, lineno, colno, endlineno, endcolno, pos, end) |
| |
| */ |
| } |
| |
| static PyObject * |
| join_list_unicode(PyObject *lst) |
| { |
| static PyObject *ustr = NULL; |
| static PyObject *joinstr = NULL; |
| if (ustr == NULL) { |
| Py_UNICODE c = 0; |
| ustr = PyUnicode_FromUnicode(&c, 0); |
| } |
| if (joinstr == NULL) { |
| joinstr = PyUnicode_InternFromString("join"); |
| } |
| if (joinstr == NULL || ustr == NULL) { |
| return NULL; |
| } |
| return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL); |
| } |
| |
| static PyObject * |
| scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) |
| { |
| PyObject *rval; |
| Py_ssize_t len = PyBytes_GET_SIZE(pystr); |
| Py_ssize_t begin = end - 1; |
| Py_ssize_t next = begin; |
| char *buf = PyBytes_AS_STRING(pystr); |
| Py_buffer info; |
| PyObject *chunks = PyList_New(0); |
| if (chunks == NULL) { |
| goto bail; |
| } |
| if (end < 0 || len <= end) { |
| PyErr_SetString(PyExc_ValueError, "end is out of bounds"); |
| goto bail; |
| } |
| while (1) { |
| /* Find the end of the string or the next escape */ |
| Py_UNICODE c = 0; |
| PyObject *chunk = NULL; |
| for (next = end; next < len; next++) { |
| c = buf[next]; |
| if (c == '"' || c == '\\') { |
| break; |
| } |
| else if (strict && c <= 0x1f) { |
| raise_errmsg("Invalid control character at", pystr, next); |
| goto bail; |
| } |
| } |
| if (!(c == '"' || c == '\\')) { |
| raise_errmsg("Unterminated string starting at", pystr, begin); |
| goto bail; |
| } |
| /* Pick up this chunk if it's not zero length */ |
| if (next != end) { |
| PyObject *strchunk; |
| if (PyBuffer_FillInfo(&info, NULL, &buf[end], next - end, 1, 0) < 0) { |
| goto bail; |
| } |
| strchunk = PyMemoryView_FromBuffer(&info); |
| if (strchunk == NULL) { |
| goto bail; |
| } |
| chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); |
| Py_DECREF(strchunk); |
| if (chunk == NULL) { |
| goto bail; |
| } |
| if (PyList_Append(chunks, chunk)) { |
| Py_DECREF(chunk); |
| goto bail; |
| } |
| Py_DECREF(chunk); |
| } |
| next++; |
| if (c == '"') { |
| end = next; |
| break; |
| } |
| if (next == len) { |
| raise_errmsg("Unterminated string starting at", pystr, begin); |
| goto bail; |
| } |
| c = buf[next]; |
| if (c != 'u') { |
| /* Non-unicode backslash escapes */ |
| end = next + 1; |
| switch (c) { |
| case '"': break; |
| case '\\': break; |
| case '/': break; |
| case 'b': c = '\b'; break; |
| case 'f': c = '\f'; break; |
| case 'n': c = '\n'; break; |
| case 'r': c = '\r'; break; |
| case 't': c = '\t'; break; |
| default: c = 0; |
| } |
| if (c == 0) { |
| raise_errmsg("Invalid \\escape", pystr, end - 2); |
| goto bail; |
| } |
| } |
| else { |
| c = 0; |
| next++; |
| end = next + 4; |
| if (end >= len) { |
| raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); |
| goto bail; |
| } |
| /* Decode 4 hex digits */ |
| for (; next < end; next++) { |
| Py_ssize_t shl = (end - next - 1) << 2; |
| Py_UNICODE digit = buf[next]; |
| switch (digit) { |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| c |= (digit - '0') << shl; break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': |
| c |= (digit - 'a' + 10) << shl; break; |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': |
| c |= (digit - 'A' + 10) << shl; break; |
| default: |
| raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); |
| goto bail; |
| } |
| } |
| #ifdef Py_UNICODE_WIDE |
| /* Surrogate pair */ |
| if (c >= 0xd800 && c <= 0xdbff) { |
| Py_UNICODE c2 = 0; |
| if (end + 6 >= len) { |
| raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, |
| end - 5); |
| } |
| if (buf[next++] != '\\' || buf[next++] != 'u') { |
| raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, |
| end - 5); |
| } |
| end += 6; |
| /* Decode 4 hex digits */ |
| for (; next < end; next++) { |
| Py_ssize_t shl = (end - next - 1) << 2; |
| Py_UNICODE digit = buf[next]; |
| switch (digit) { |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| c2 |= (digit - '0') << shl; break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': |
| c2 |= (digit - 'a' + 10) << shl; break; |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': |
| c2 |= (digit - 'A' + 10) << shl; break; |
| default: |
| raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); |
| goto bail; |
| } |
| } |
| c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
| } |
| #endif |
| } |
| chunk = PyUnicode_FromUnicode(&c, 1); |
| if (chunk == NULL) { |
| goto bail; |
| } |
| if (PyList_Append(chunks, chunk)) { |
| Py_DECREF(chunk); |
| goto bail; |
| } |
| Py_DECREF(chunk); |
| } |
| |
| rval = join_list_unicode(chunks); |
| if (rval == NULL) { |
| goto bail; |
| } |
| Py_CLEAR(chunks); |
| return Py_BuildValue("(Nn)", rval, end); |
| bail: |
| Py_XDECREF(chunks); |
| return NULL; |
| } |
| |
| |
| static PyObject * |
| scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict) |
| { |
| PyObject *rval; |
| Py_ssize_t len = PyUnicode_GET_SIZE(pystr); |
| Py_ssize_t begin = end - 1; |
| Py_ssize_t next = begin; |
| const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); |
| PyObject *chunks = PyList_New(0); |
| if (chunks == NULL) { |
| goto bail; |
| } |
| if (end < 0 || len <= end) { |
| PyErr_SetString(PyExc_ValueError, "end is out of bounds"); |
| goto bail; |
| } |
| while (1) { |
| /* Find the end of the string or the next escape */ |
| Py_UNICODE c = 0; |
| PyObject *chunk = NULL; |
| for (next = end; next < len; next++) { |
| c = buf[next]; |
| if (c == '"' || c == '\\') { |
| break; |
| } |
| else if (strict && c <= 0x1f) { |
| raise_errmsg("Invalid control character at", pystr, next); |
| goto bail; |
| } |
| } |
| if (!(c == '"' || c == '\\')) { |
| raise_errmsg("Unterminated string starting at", pystr, begin); |
| goto bail; |
| } |
| /* Pick up this chunk if it's not zero length */ |
| if (next != end) { |
| chunk = PyUnicode_FromUnicode(&buf[end], next - end); |
| if (chunk == NULL) { |
| goto bail; |
| } |
| if (PyList_Append(chunks, chunk)) { |
| Py_DECREF(chunk); |
| goto bail; |
| } |
| Py_DECREF(chunk); |
| } |
| next++; |
| if (c == '"') { |
| end = next; |
| break; |
| } |
| if (next == len) { |
| raise_errmsg("Unterminated string starting at", pystr, begin); |
| goto bail; |
| } |
| c = buf[next]; |
| if (c != 'u') { |
| /* Non-unicode backslash escapes */ |
| end = next + 1; |
| switch (c) { |
| case '"': break; |
| case '\\': break; |
| case '/': break; |
| case 'b': c = '\b'; break; |
| case 'f': c = '\f'; break; |
| case 'n': c = '\n'; break; |
| case 'r': c = '\r'; break; |
| case 't': c = '\t'; break; |
| default: c = 0; |
| } |
| if (c == 0) { |
| raise_errmsg("Invalid \\escape", pystr, end - 2); |
| goto bail; |
| } |
| } |
| else { |
| c = 0; |
| next++; |
| end = next + 4; |
| if (end >= len) { |
| raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); |
| goto bail; |
| } |
| /* Decode 4 hex digits */ |
| for (; next < end; next++) { |
| Py_ssize_t shl = (end - next - 1) << 2; |
| Py_UNICODE digit = buf[next]; |
| switch (digit) { |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| c |= (digit - '0') << shl; break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': |
| c |= (digit - 'a' + 10) << shl; break; |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': |
| c |= (digit - 'A' + 10) << shl; break; |
| default: |
| raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); |
| goto bail; |
| } |
| } |
| #ifdef Py_UNICODE_WIDE |
| /* Surrogate pair */ |
| if (c >= 0xd800 && c <= 0xdbff) { |
| Py_UNICODE c2 = 0; |
| if (end + 6 >= len) { |
| raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, |
| end - 5); |
| } |
| if (buf[next++] != '\\' || buf[next++] != 'u') { |
| raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, |
| end - 5); |
| } |
| end += 6; |
| /* Decode 4 hex digits */ |
| for (; next < end; next++) { |
| Py_ssize_t shl = (end - next - 1) << 2; |
| Py_UNICODE digit = buf[next]; |
| switch (digit) { |
| case '0': case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| c2 |= (digit - '0') << shl; break; |
| case 'a': case 'b': case 'c': case 'd': case 'e': |
| case 'f': |
| c2 |= (digit - 'a' + 10) << shl; break; |
| case 'A': case 'B': case 'C': case 'D': case 'E': |
| case 'F': |
| c2 |= (digit - 'A' + 10) << shl; break; |
| default: |
| raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); |
| goto bail; |
| } |
| } |
| c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); |
| } |
| #endif |
| } |
| chunk = PyUnicode_FromUnicode(&c, 1); |
| if (chunk == NULL) { |
| goto bail; |
| } |
| if (PyList_Append(chunks, chunk)) { |
| Py_DECREF(chunk); |
| goto bail; |
| } |
| Py_DECREF(chunk); |
| } |
| |
| rval = join_list_unicode(chunks); |
| if (rval == NULL) { |
| goto bail; |
| } |
| Py_CLEAR(chunks); |
| return Py_BuildValue("(Nn)", rval, end); |
| bail: |
| Py_XDECREF(chunks); |
| return NULL; |
| } |
| |
| PyDoc_STRVAR(pydoc_scanstring, |
| "scanstring(str_or_bytes, end, encoding) -> (bytes, end)\n"); |
| |
| static PyObject * |
| py_scanstring(PyObject* self, PyObject *args) |
| { |
| PyObject *pystr; |
| Py_ssize_t end; |
| char *encoding = NULL; |
| int strict = 0; |
| if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) { |
| return NULL; |
| } |
| if (encoding == NULL) { |
| encoding = DEFAULT_ENCODING; |
| } |
| if (PyBytes_Check(pystr)) { |
| return scanstring_str(pystr, end, encoding, strict); |
| } |
| else if (PyUnicode_Check(pystr)) { |
| return scanstring_unicode(pystr, end, strict); |
| } |
| else { |
| PyErr_Format(PyExc_TypeError, |
| "first argument must be a string or bytes, not %.80s", |
| Py_TYPE(pystr)->tp_name); |
| return NULL; |
| } |
| } |
| |
| PyDoc_STRVAR(pydoc_encode_basestring_ascii, |
| "encode_basestring_ascii(str_or_bytes) -> bytes\n"); |
| |
| static PyObject * |
| py_encode_basestring_ascii(PyObject* self, PyObject *pystr) |
| { |
| PyObject *rval; |
| /* METH_O */ |
| if (PyBytes_Check(pystr)) { |
| rval = ascii_escape_str(pystr); |
| } |
| else if (PyUnicode_Check(pystr)) { |
| rval = ascii_escape_unicode(pystr); |
| } |
| else { |
| PyErr_Format(PyExc_TypeError, |
| "first argument must be a string or unicode, not %.80s", |
| Py_TYPE(pystr)->tp_name); |
| return NULL; |
| } |
| if (rval != NULL && PyBytes_Check(rval)) { |
| PyObject *urval = PyUnicode_DecodeASCII(PyBytes_AS_STRING(rval), PyBytes_GET_SIZE(rval), NULL); |
| Py_DECREF(rval); |
| return urval; |
| } |
| return rval; |
| } |
| |
| static PyMethodDef json_methods[] = { |
| {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii, |
| METH_O, pydoc_encode_basestring_ascii}, |
| {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS, |
| pydoc_scanstring}, |
| {NULL, NULL, 0, NULL} |
| }; |
| |
| PyDoc_STRVAR(module_doc, |
| "json speedups\n"); |
| |
| static struct PyModuleDef jsonmodule = { |
| PyModuleDef_HEAD_INIT, |
| "_json", |
| module_doc, |
| -1, |
| json_methods, |
| NULL, |
| NULL, |
| NULL, |
| NULL |
| }; |
| |
| PyObject* |
| PyInit__json(void) |
| { |
| return PyModule_Create(&jsonmodule); |
| } |