Issue #5006: Better handling of Unicode byte-order marks (BOM) in the io library.
This means, for example, that opening a UTF-16 text file in
append mode doesn't add a BOM at the end of the file if the file isn't
empty.
diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c
index 1bba13a..ba653d6 100644
--- a/Modules/_io/_iomodule.c
+++ b/Modules/_io/_iomodule.c
@@ -41,6 +41,7 @@
 PyObject *_PyIO_str_reset;
 PyObject *_PyIO_str_seek;
 PyObject *_PyIO_str_seekable;
+PyObject *_PyIO_str_setstate;
 PyObject *_PyIO_str_tell;
 PyObject *_PyIO_str_truncate;
 PyObject *_PyIO_str_writable;
@@ -48,6 +49,7 @@
 
 PyObject *_PyIO_empty_str;
 PyObject *_PyIO_empty_bytes;
+PyObject *_PyIO_zero;
 
 
 PyDoc_STRVAR(module_doc,
@@ -734,6 +736,8 @@
         goto fail;
     if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
         goto fail;
+    if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
+        goto fail;
     if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
         goto fail;
     if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
@@ -747,6 +751,8 @@
         goto fail;
     if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
         goto fail;
+    if (!(_PyIO_zero = PyLong_FromLong(0L)))
+        goto fail;
 
     state->initialized = 1;
 
diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h
index a44f127..ef7248a 100644
--- a/Modules/_io/_iomodule.h
+++ b/Modules/_io/_iomodule.h
@@ -141,6 +141,7 @@
 extern PyObject *_PyIO_str_reset;
 extern PyObject *_PyIO_str_seek;
 extern PyObject *_PyIO_str_seekable;
+extern PyObject *_PyIO_str_setstate;
 extern PyObject *_PyIO_str_tell;
 extern PyObject *_PyIO_str_truncate;
 extern PyObject *_PyIO_str_writable;
@@ -148,3 +149,4 @@
 
 extern PyObject *_PyIO_empty_str;
 extern PyObject *_PyIO_empty_bytes;
+extern PyObject *_PyIO_zero;
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index f201ba7..8d2a686 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -647,6 +647,8 @@
     char telling;
     /* Specialized encoding func (see below) */
     encodefunc_t encodefunc;
+    /* Whether or not it's the start of the stream */
+    char encoding_start_of_stream;
 
     /* Reads and writes are internally buffered in order to speed things up.
        However, any read will first flush the write buffer if itsn't empty.
@@ -707,21 +709,50 @@
 static PyObject *
 utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
 {
-    PyObject *res;
-    res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
-                                PyUnicode_GET_SIZE(text),
-                                PyBytes_AS_STRING(self->errors), 0);
-    if (res == NULL)
-        return NULL;
-    /* Next writes will skip the BOM and use native byte ordering */
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
 #if defined(WORDS_BIGENDIAN)
-    self->encodefunc = (encodefunc_t) utf16be_encode;
+        return utf16be_encode(self, text);
 #else
-    self->encodefunc = (encodefunc_t) utf16le_encode;
+        return utf16le_encode(self, text);
 #endif
-    return res;
+    }
+    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
 }
 
+static PyObject *
+utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 1);
+}
+
+static PyObject *
+utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+        return utf32be_encode(self, text);
+#else
+        return utf32le_encode(self, text);
+#endif
+    }
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
+}
 
 static PyObject *
 utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
@@ -749,10 +780,13 @@
 static encodefuncentry encodefuncs[] = {
     {"ascii",       (encodefunc_t) ascii_encode},
     {"iso8859-1",   (encodefunc_t) latin1_encode},
+    {"utf-8",       (encodefunc_t) utf8_encode},
     {"utf-16-be",   (encodefunc_t) utf16be_encode},
     {"utf-16-le",   (encodefunc_t) utf16le_encode},
     {"utf-16",      (encodefunc_t) utf16_encode},
-    {"utf-8",       (encodefunc_t) utf8_encode},
+    {"utf-32-be",   (encodefunc_t) utf32be_encode},
+    {"utf-32-le",   (encodefunc_t) utf32le_encode},
+    {"utf-32",      (encodefunc_t) utf32_encode},
     {NULL, NULL}
 };
 
@@ -978,6 +1012,33 @@
     self->seekable = self->telling = PyObject_IsTrue(res);
     Py_DECREF(res);
 
+    self->encoding_start_of_stream = 0;
+    if (self->seekable && self->encoder) {
+        PyObject *cookieObj;
+        int cmp;
+
+        self->encoding_start_of_stream = 1;
+
+        cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
+        if (cookieObj == NULL)
+            goto error;
+
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
+        Py_DECREF(cookieObj);
+        if (cmp < 0) {
+            goto error;
+        }
+
+        if (cmp == 0) {
+            self->encoding_start_of_stream = 0;
+            res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+                                             _PyIO_zero, NULL);
+            if (res == NULL)
+                goto error;
+            Py_DECREF(res);
+        }
+    }
+
     self->ok = 1;
     return 0;
 
@@ -1192,8 +1253,10 @@
         needflush = 1;
 
     /* XXX What if we were just reading? */
-    if (self->encodefunc != NULL)
+    if (self->encodefunc != NULL) {
         b = (*self->encodefunc)((PyObject *) self, text);
+        self->encoding_start_of_stream = 0;
+    }
     else
         b = PyObject_CallMethodObjArgs(self->encoder,
                                        _PyIO_str_encode, text, NULL);
@@ -1847,24 +1910,38 @@
     return 0;
 }
 
+static int
+_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
+                                CookieStruct *cookie)
+{
+    PyObject *res;
+    /* Same as _TextIOWrapper_decoder_setstate() above. */
+    if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
+        self->encoding_start_of_stream = 1;
+    }
+    else {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+                                         _PyIO_zero, NULL);
+        self->encoding_start_of_stream = 0;
+    }
+    if (res == NULL)
+        return -1;
+    Py_DECREF(res);
+    return 0;
+}
+
 static PyObject *
 TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
 {
     PyObject *cookieObj, *posobj;
     CookieStruct cookie;
     int whence = 0;
-    static PyObject *zero = NULL;
     PyObject *res;
     int cmp;
 
     CHECK_INITIALIZED(self);
 
-    if (zero == NULL) {
-        zero = PyLong_FromLong(0L);
-        if (zero == NULL)
-            return NULL;
-    }
-
     if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
         return NULL;
     CHECK_CLOSED(self);
@@ -1879,7 +1956,7 @@
 
     if (whence == 1) {
         /* seek relative to current position */
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1900,7 +1977,7 @@
     else if (whence == 2) {
         /* seek relative to end of file */
 
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1934,7 +2011,7 @@
         goto fail;
     }
 
-    cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
+    cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
     if (cmp < 0)
         goto fail;
 
@@ -2013,6 +2090,11 @@
             goto fail;
     }
 
+    /* Finally, reset the encoder (merely useful for proper BOM handling) */
+    if (self->encoder) {
+        if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
+            goto fail;
+    }
     return cookieObj;
   fail:
     Py_XDECREF(cookieObj);