This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* byte order mark
(BOM). Subsequent BOM characters are no longer interpreted or removed.
The UTF-16-LE and UTF-16-BE codecs pass through all BOM characters.
These changes should bring the UTF-16 codec more in line with what
the Unicode FAQ recommends with respect to BOMs.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 988ea1b..f91a5a0 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -459,10 +459,11 @@
*byteorder == 0: native order
*byteorder == 1: big endian
- and then switches according to all BOM marks it finds in the input
- data. BOM marks are not copied into the resulting Unicode string.
- After completion, *byteorder is set to the current byte order at
- the end of input data.
+ In native mode, the first two bytes of the stream are checked for a
+ BOM mark. If found, the BOM mark is analysed, the byte order
+ adjusted and the BOM skipped. In the other modes, no BOM mark
+ interpretation is done. After completion, *byteorder is set to the
+ current byte order at the end of input data.
If byteorder is NULL, the codec starts in native order mode.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 475215c..d55e2a7 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1001,31 +1001,39 @@
if (byteorder)
bo = *byteorder;
+ /* Check for a BOM mark (U+FEFF) at the start of the input and adjust
+ the current byte order setting accordingly. In native mode, a leading
+ BOM mark is skipped; in all other modes, it is copied to the output
+ stream as-is (giving a ZWNBSP character). */
+ if (bo == 0) {
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ if (*q == 0xFEFF) {
+ q++;
+ bo = -1;
+ } else if (*q == 0xFFFE) {
+ q++;
+ bo = 1;
+ }
+#else
+ if (*q == 0xFEFF) {
+ q++;
+ bo = 1;
+ } else if (*q == 0xFFFE) {
+ q++;
+ bo = -1;
+ }
+#endif
+ }
+
while (q < e) {
register Py_UNICODE ch = *q++;
- /* Check for BOM marks (U+FEFF) in the input and adjust
- current byte order setting accordingly. Swap input
- bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
- !) */
+ /* Swap input bytes if needed. (This assumes
+ sizeof(Py_UNICODE) == 2 !) */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if (ch == 0xFEFF) {
- bo = -1;
- continue;
- } else if (ch == 0xFFFE) {
- bo = 1;
- continue;
- }
if (bo == 1)
ch = (ch >> 8) | (ch << 8);
#else
- if (ch == 0xFEFF) {
- bo = 1;
- continue;
- } else if (ch == 0xFFFE) {
- bo = -1;
- continue;
- }
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif