bpo-33578: Add getstate/setstate for CJK codec (GH-6984)



This implements getstate and setstate for the cjkcodecs multibyte incremental encoders/decoders, primarily to fix issues with seek/tell.

The encoder getstate/setstate pair is slightly tricky: the "state" consists of the pending bytes plus the MultibyteCodec_State, but only an integer can be returned. The approach I've taken is to pack this data into a single long, similar to how .tell() encodes a "cookie_type" as a long.


https://bugs.python.org/issue33578
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c
index 22172b0..4633499 100644
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -896,6 +896,93 @@
 }
 
 /*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self)
+/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/
+{
+    /* The state is serialised into a byte string which is then converted
+       to an int (little-endian, unsigned).  Layout:
+         byte 0      length N of the UTF-8 encoded pending text
+         bytes 1..N  the pending text, UTF-8 encoded (each character can
+                     use up to 4 bytes, hence MAXENCPENDING*4)
+         remainder   the bytes of MultibyteCodec_State.c
+       A byte array is used to avoid different compilers generating
+       different values for the same state, e.g. due to struct padding. */
+    unsigned char packed[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+    Py_ssize_t used = 1;    /* byte 0 is always present */
+    Py_ssize_t utf8len = 0;
+
+    /* No pending text is recorded as a zero-length entry. */
+    if (self->pending != NULL) {
+        const char *utf8 = PyUnicode_AsUTF8AndSize(self->pending, &utf8len);
+        if (utf8 == NULL) {
+            return NULL;
+        }
+        if (utf8len > MAXENCPENDING*4) {
+            PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+            return NULL;
+        }
+        memcpy(packed + used, utf8, utf8len);
+        used += utf8len;
+    }
+    packed[0] = utf8len;
+
+    /* The codec state always directly follows the pending text. */
+    memcpy(packed + used, self->state.c, sizeof(self->state.c));
+    used += sizeof(self->state.c);
+
+    return (PyObject *)_PyLong_FromByteArray(packed, used,
+                                             1 /* little-endian */ ,
+                                             0 /* unsigned */ );
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.setstate
+    state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type')
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
+                                                          PyLongObject *statelong)
+/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/
+{
+    /* Unpacks the int produced by getstate(): byte 0 is the length N of
+       the UTF-8 encoded pending text, bytes 1..N are that text, and the
+       bytes that follow are copied verbatim into self->state.c. */
+    unsigned char packed[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+    const unsigned char *utf8 = packed + 1;
+    PyObject *restored;
+
+    /* Serialise the int into the fixed-size little-endian buffer. */
+    if (_PyLong_AsByteArray(statelong, packed, sizeof(packed),
+                            1 /* little-endian */ ,
+                            0 /* unsigned */ ) < 0) {
+        return NULL;
+    }
+
+    if (packed[0] > MAXENCPENDING*4) {
+        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+        return NULL;
+    }
+
+    restored = PyUnicode_DecodeUTF8((const char *)utf8, packed[0], "strict");
+    if (restored == NULL) {
+        return NULL;
+    }
+
+    /* Only touch the encoder once every validation step has passed. */
+    Py_CLEAR(self->pending);
+    self->pending = restored;
+    memcpy(self->state.c, utf8 + packed[0], sizeof(self->state.c));
+
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
 _multibytecodec.MultibyteIncrementalEncoder.reset
 [clinic start generated code]*/
 
@@ -919,6 +1006,8 @@
 
 static struct PyMethodDef mbiencoder_methods[] = {
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF /* getstate() entry */
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF /* setstate() entry */
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF
     {NULL, NULL},
 };
@@ -984,6 +1073,7 @@
 {
     PyObject_GC_UnTrack(self);
     ERROR_DECREF(self->errors);
+    Py_CLEAR(self->pending);
     Py_TYPE(self)->tp_free(self);
 }
 
@@ -1120,6 +1210,68 @@
 }
 
 /*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self)
+/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/
+{
+    /* Returns a 2-tuple (pending, state).  `pending` is a bytes object
+       holding a copy of the self->pendingsize bytes in self->pending and
+       `state` is an int built from every byte of self->state.c in
+       little-endian order.  Packing the whole array, rather than only
+       its first byte, keeps codec state stored beyond c[0] intact across
+       a getstate()/setstate() round trip.  Returns NULL with an
+       exception set on failure. */
+    PyObject *buffer;
+    PyObject *statelong;
+
+    buffer = PyBytes_FromStringAndSize((const char *)self->pending,
+                                       self->pendingsize);
+    if (buffer == NULL) {
+        return NULL;
+    }
+
+    statelong = (PyObject *)_PyLong_FromByteArray(self->state.c,
+                                                  sizeof(self->state.c),
+                                                  1 /* little-endian */ ,
+                                                  0 /* unsigned */ );
+    if (statelong == NULL) {
+        Py_DECREF(buffer);
+        return NULL;
+    }
+
+    /* "N" hands both references over to the new tuple. */
+    return Py_BuildValue("NN", buffer, statelong);
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.setstate
+    state: object(subclass_of='&PyTuple_Type')
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
+                                                          PyObject *state)
+/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/
+{
+    /* Restores the decoder from a 2-tuple (pending, state).  `pending`
+       must be a bytes object of at most MAXDECPENDING bytes and `state`
+       an int; the int is unpacked in little-endian order into
+       self->state.c.  Going through a byte array of exactly
+       sizeof(self->state.c) bytes keeps the result independent of the
+       host byte order and of the width of any C integer type.  Nothing
+       in self is modified unless every check has passed.  Returns None,
+       or NULL with an exception set on failure. */
+    PyObject *buffer;
+    PyLongObject *statelong;
+    Py_ssize_t buffersize;
+    char *bufferstr;
+    unsigned char statebytes[sizeof(self->state.c)];
+
+    if (!PyArg_ParseTuple(state, "SO!;setstate(): illegal state argument",
+                          &buffer, &PyLong_Type, &statelong))
+    {
+        return NULL;
+    }
+
+    /* Convert the int before touching self so that a value that cannot
+       be represented in the state array leaves the decoder unchanged. */
+    if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes),
+                            1 /* little-endian */ ,
+                            0 /* unsigned */ ) < 0) {
+        return NULL;
+    }
+
+    buffersize = PyBytes_Size(buffer);
+    if (buffersize == -1) {
+        return NULL;
+    }
+
+    if (buffersize > MAXDECPENDING) {
+        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+        return NULL;
+    }
+
+    bufferstr = PyBytes_AsString(buffer);
+    if (bufferstr == NULL) {
+        return NULL;
+    }
+    self->pendingsize = buffersize;
+    memcpy(self->pending, bufferstr, self->pendingsize);
+    memcpy(self->state.c, statebytes, sizeof(statebytes));
+
+    Py_RETURN_NONE;
+}
+
+/*[clinic input]
 _multibytecodec.MultibyteIncrementalDecoder.reset
 [clinic start generated code]*/
 
@@ -1137,6 +1289,8 @@
 
 static struct PyMethodDef mbidecoder_methods[] = {
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF /* getstate() entry */
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF /* setstate() entry */
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF
     {NULL, NULL},
 };