Issue #5006: Better handling of unicode byte-order marks (BOM) in the io library. This means, for example, that opening a UTF-16 text file in append mode doesn't add a BOM at the end of the file if the file isn't empty.

commit: e450185b4ad645d4f72cbd4b2139d6a987edc84d [log] [tgz]
author: Antoine Pitrou <solipsis@pitrou.net> Thu May 14 18:55:55 2009 +0000
committer: Antoine Pitrou <solipsis@pitrou.net> Thu May 14 18:55:55 2009 +0000
tree: d588925c1710f0404f9ac61058a79a5b33382408
parent: b565577aa722d8b39aa42da0384f776680c03c36 [diff]
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index e3e7c3d..c9a7c5e 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py

@@ -1436,6 +1436,15 @@
         self._snapshot = None  # info for reconstructing decoder state
         self._seekable = self._telling = self.buffer.seekable()
 
+        if self._seekable and self.writable():
+            position = self.buffer.tell()
+            if position != 0:
+                try:
+                    self._get_encoder().setstate(0)
+                except LookupError:
+                    # Sometimes the encoder doesn't exist
+                    pass
+
     # self._snapshot is either None, or a tuple (dec_flags, next_input)
     # where dec_flags is the second (integer) item of the decoder state
     # and next_input is the chunk of input bytes that comes next after the
@@ -1741,6 +1750,17 @@
                 raise IOError("can't restore logical file position")
             self._decoded_chars_used = chars_to_skip
 
+        # Finally, reset the encoder (merely useful for proper BOM handling)
+        try:
+            encoder = self._encoder or self._get_encoder()
+        except LookupError:
+            # Sometimes the encoder doesn't exist
+            pass
+        else:
+            if cookie != 0:
+                encoder.setstate(0)
+            else:
+                encoder.reset()
         return cookie
 
     def read(self, n=None):

diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 1a525dc..98dc711 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py

@@ -1963,6 +1963,37 @@
 
         self.assertEqual(buffer.seekable(), txt.seekable())
 
+    def test_append_bom(self):
+        # The BOM is not written again when appending to a non-empty file
+        filename = support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with self.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaa'.encode(charset))
+
+            with self.open(filename, 'a', encoding=charset) as f:
+                f.write('xxx')
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'aaaxxx'.encode(charset))
+
+    def test_seek_bom(self):
+        # Same test, but when seeking manually
+        filename = support.TESTFN
+        for charset in ('utf-8-sig', 'utf-16', 'utf-32'):
+            with self.open(filename, 'w', encoding=charset) as f:
+                f.write('aaa')
+                pos = f.tell()
+            with self.open(filename, 'r+', encoding=charset) as f:
+                f.seek(pos)
+                f.write('zzz')
+                f.seek(0)
+                f.write('bbb')
+            with self.open(filename, 'rb') as f:
+                self.assertEquals(f.read(), 'bbbzzz'.encode(charset))
+
+
 class CTextIOWrapperTest(TextIOWrapperTest):
 
     def test_initialization(self):

diff --git a/Misc/NEWS b/Misc/NEWS
index 8cb1e38..d8ba4c5 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS

@@ -23,6 +23,11 @@
 Library
 -------
 
+- Issue #5006: Better handling of unicode byte-order marks (BOM) in the io
+  library. This means, for example, that opening an UTF-16 text file in
+  append mode doesn't add a BOM at the end of the file if the file isn't
+  empty.
+
 - Issue #4050: inspect.findsource/getsource now raise an IOError if the 'source'
   file is a binary.  Patch by Brodie Rao, tests by Daniel Diniz.  This fix
   corrects a pydoc regression.

diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c
index 1bba13a..ba653d6 100644
--- a/Modules/_io/_iomodule.c
+++ b/Modules/_io/_iomodule.c

@@ -41,6 +41,7 @@
 PyObject *_PyIO_str_reset;
 PyObject *_PyIO_str_seek;
 PyObject *_PyIO_str_seekable;
+PyObject *_PyIO_str_setstate;
 PyObject *_PyIO_str_tell;
 PyObject *_PyIO_str_truncate;
 PyObject *_PyIO_str_writable;
@@ -48,6 +49,7 @@
 
 PyObject *_PyIO_empty_str;
 PyObject *_PyIO_empty_bytes;
+PyObject *_PyIO_zero;
 
 
 PyDoc_STRVAR(module_doc,
@@ -734,6 +736,8 @@
         goto fail;
     if (!(_PyIO_str_seekable = PyUnicode_InternFromString("seekable")))
         goto fail;
+    if (!(_PyIO_str_setstate = PyUnicode_InternFromString("setstate")))
+        goto fail;
     if (!(_PyIO_str_tell = PyUnicode_InternFromString("tell")))
         goto fail;
     if (!(_PyIO_str_truncate = PyUnicode_InternFromString("truncate")))
@@ -747,6 +751,8 @@
         goto fail;
     if (!(_PyIO_empty_bytes = PyBytes_FromStringAndSize(NULL, 0)))
         goto fail;
+    if (!(_PyIO_zero = PyLong_FromLong(0L)))
+        goto fail;
 
     state->initialized = 1;
 

diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h
index a44f127..ef7248a 100644
--- a/Modules/_io/_iomodule.h
+++ b/Modules/_io/_iomodule.h

@@ -141,6 +141,7 @@
 extern PyObject *_PyIO_str_reset;
 extern PyObject *_PyIO_str_seek;
 extern PyObject *_PyIO_str_seekable;
+extern PyObject *_PyIO_str_setstate;
 extern PyObject *_PyIO_str_tell;
 extern PyObject *_PyIO_str_truncate;
 extern PyObject *_PyIO_str_writable;
@@ -148,3 +149,4 @@
 
 extern PyObject *_PyIO_empty_str;
 extern PyObject *_PyIO_empty_bytes;
+extern PyObject *_PyIO_zero;

diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index f201ba7..8d2a686 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c

@@ -647,6 +647,8 @@
     char telling;
     /* Specialized encoding func (see below) */
     encodefunc_t encodefunc;
+    /* Whether or not it's the start of the stream */
+    char encoding_start_of_stream;
 
     /* Reads and writes are internally buffered in order to speed things up.
        However, any read will first flush the write buffer if itsn't empty.
@@ -707,21 +709,50 @@
 static PyObject *
 utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
 {
-    PyObject *res;
-    res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
-                                PyUnicode_GET_SIZE(text),
-                                PyBytes_AS_STRING(self->errors), 0);
-    if (res == NULL)
-        return NULL;
-    /* Next writes will skip the BOM and use native byte ordering */
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
 #if defined(WORDS_BIGENDIAN)
-    self->encodefunc = (encodefunc_t) utf16be_encode;
+        return utf16be_encode(self, text);
 #else
-    self->encodefunc = (encodefunc_t) utf16le_encode;
+        return utf16le_encode(self, text);
 #endif
-    return res;
+    }
+    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
 }
 
+static PyObject *
+utf32be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 1);
+}
+
+static PyObject *
+utf32le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf32_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    if (!self->encoding_start_of_stream) {
+        /* Skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+        return utf32be_encode(self, text);
+#else
+        return utf32le_encode(self, text);
+#endif
+    }
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 0);
+}
 
 static PyObject *
 utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
@@ -749,10 +780,13 @@
 static encodefuncentry encodefuncs[] = {
     {"ascii",       (encodefunc_t) ascii_encode},
     {"iso8859-1",   (encodefunc_t) latin1_encode},
+    {"utf-8",       (encodefunc_t) utf8_encode},
     {"utf-16-be",   (encodefunc_t) utf16be_encode},
     {"utf-16-le",   (encodefunc_t) utf16le_encode},
     {"utf-16",      (encodefunc_t) utf16_encode},
-    {"utf-8",       (encodefunc_t) utf8_encode},
+    {"utf-32-be",   (encodefunc_t) utf32be_encode},
+    {"utf-32-le",   (encodefunc_t) utf32le_encode},
+    {"utf-32",      (encodefunc_t) utf32_encode},
     {NULL, NULL}
 };
 
@@ -978,6 +1012,33 @@
     self->seekable = self->telling = PyObject_IsTrue(res);
     Py_DECREF(res);
 
+    self->encoding_start_of_stream = 0;
+    if (self->seekable && self->encoder) {
+        PyObject *cookieObj;
+        int cmp;
+
+        self->encoding_start_of_stream = 1;
+
+        cookieObj = PyObject_CallMethodObjArgs(buffer, _PyIO_str_tell, NULL);
+        if (cookieObj == NULL)
+            goto error;
+
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
+        Py_DECREF(cookieObj);
+        if (cmp < 0) {
+            goto error;
+        }
+
+        if (cmp == 0) {
+            self->encoding_start_of_stream = 0;
+            res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+                                             _PyIO_zero, NULL);
+            if (res == NULL)
+                goto error;
+            Py_DECREF(res);
+        }
+    }
+
     self->ok = 1;
     return 0;
 
@@ -1192,8 +1253,10 @@
         needflush = 1;
 
     /* XXX What if we were just reading? */
-    if (self->encodefunc != NULL)
+    if (self->encodefunc != NULL) {
         b = (*self->encodefunc)((PyObject *) self, text);
+        self->encoding_start_of_stream = 0;
+    }
     else
         b = PyObject_CallMethodObjArgs(self->encoder,
                                        _PyIO_str_encode, text, NULL);
@@ -1847,24 +1910,38 @@
     return 0;
 }
 
+static int
+_TextIOWrapper_encoder_setstate(PyTextIOWrapperObject *self,
+                                CookieStruct *cookie)
+{
+    PyObject *res;
+    /* Same as _TextIOWrapper_decoder_setstate() above. */
+    if (cookie->start_pos == 0 && cookie->dec_flags == 0) {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_reset, NULL);
+        self->encoding_start_of_stream = 1;
+    }
+    else {
+        res = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_setstate,
+                                         _PyIO_zero, NULL);
+        self->encoding_start_of_stream = 0;
+    }
+    if (res == NULL)
+        return -1;
+    Py_DECREF(res);
+    return 0;
+}
+
 static PyObject *
 TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
 {
     PyObject *cookieObj, *posobj;
     CookieStruct cookie;
     int whence = 0;
-    static PyObject *zero = NULL;
     PyObject *res;
     int cmp;
 
     CHECK_INITIALIZED(self);
 
-    if (zero == NULL) {
-        zero = PyLong_FromLong(0L);
-        if (zero == NULL)
-            return NULL;
-    }
-
     if (!PyArg_ParseTuple(args, "O|i:seek", &cookieObj, &whence))
         return NULL;
     CHECK_CLOSED(self);
@@ -1879,7 +1956,7 @@
 
     if (whence == 1) {
         /* seek relative to current position */
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1900,7 +1977,7 @@
     else if (whence == 2) {
         /* seek relative to end of file */
 
-        cmp = PyObject_RichCompareBool(cookieObj, zero, Py_EQ);
+        cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_EQ);
         if (cmp < 0)
             goto fail;
 
@@ -1934,7 +2011,7 @@
         goto fail;
     }
 
-    cmp = PyObject_RichCompareBool(cookieObj, zero, Py_LT);
+    cmp = PyObject_RichCompareBool(cookieObj, _PyIO_zero, Py_LT);
     if (cmp < 0)
         goto fail;
 
@@ -2013,6 +2090,11 @@
             goto fail;
     }
 
+    /* Finally, reset the encoder (merely useful for proper BOM handling) */
+    if (self->encoder) {
+        if (_TextIOWrapper_encoder_setstate(self, &cookie) < 0)
+            goto fail;
+    }
     return cookieObj;
   fail:
     Py_XDECREF(cookieObj);
commit	e450185b4ad645d4f72cbd4b2139d6a987edc84d	[log] [tgz]
author	Antoine Pitrou <solipsis@pitrou.net>	Thu May 14 18:55:55 2009 +0000
committer	Antoine Pitrou <solipsis@pitrou.net>	Thu May 14 18:55:55 2009 +0000
tree	d588925c1710f0404f9ac61058a79a5b33382408
parent	b565577aa722d8b39aa42da0384f776680c03c36 [diff]