Issue #6784: Strings from Python 2 can now be unpickled as bytes objects. Initial patch by Merlijn van Deen. I've added a few unrelated docstring fixes in the patch while I was at it, which makes the documentation for pickle a bit more consistent.

commit: d05c9ff84501d93b13de40a9c7b0360c7d2ebada [log] [tgz]
author: Alexandre Vassalotti <alexandre@peadrop.com> Sat Dec 07 01:09:27 2013 -0800
committer: Alexandre Vassalotti <alexandre@peadrop.com> Sat Dec 07 01:09:27 2013 -0800
tree: ae840ca5e91d21e53cc60e6c3e7fdd64b5a9fec4
parent: ee07b94788e5e3e79f6632e92a5295adc3937bf4 [diff]
diff --git a/Lib/pickle.py b/Lib/pickle.py
index c57149a..9cd0132 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py

@@ -348,24 +348,25 @@
     def __init__(self, file, protocol=None, *, fix_imports=True):
         """This takes a binary file for writing a pickle data stream.
 
-        The optional protocol argument tells the pickler to use the
+        The optional *protocol* argument tells the pickler to use the
         given protocol; supported protocols are 0, 1, 2, 3 and 4.  The
-        default protocol is 3; a backward-incompatible protocol designed for
-        Python 3.
+        default protocol is 3; a backward-incompatible protocol designed
+        for Python 3.
 
         Specifying a negative protocol version selects the highest
         protocol version supported.  The higher the protocol used, the
         more recent the version of Python needed to read the pickle
         produced.
 
-        The file argument must have a write() method that accepts a single
-        bytes argument. It can thus be a file object opened for binary
-        writing, a io.BytesIO instance, or any other custom object that
-        meets this interface.
+        The *file* argument must have a write() method that accepts a
+        single bytes argument. It can thus be a file object opened for
+        binary writing, an io.BytesIO instance, or any other custom
+        object that meets this interface.
 
-        If fix_imports is True and protocol is less than 3, pickle will try to
-        map the new Python 3 names to the old module names used in Python 2,
-        so that the pickle data stream is readable with Python 2.
+        If *fix_imports* is True and *protocol* is less than 3, pickle
+        will try to map the new Python 3 names to the old module names
+        used in Python 2, so that the pickle data stream is readable
+        with Python 2.
         """
         if protocol is None:
             protocol = DEFAULT_PROTOCOL
@@ -389,10 +390,9 @@
         """Clears the pickler's "memo".
 
         The memo is the data structure that remembers which objects the
-        pickler has already seen, so that shared or recursive objects are
-        pickled by reference and not by value.  This method is useful when
-        re-using picklers.
-
+        pickler has already seen, so that shared or recursive objects
+        are pickled by reference and not by value.  This method is
+        useful when re-using picklers.
         """
         self.memo.clear()
 
@@ -975,8 +975,14 @@
                  encoding="ASCII", errors="strict"):
         """This takes a binary file for reading a pickle data stream.
 
-        The protocol version of the pickle is detected automatically, so no
-        proto argument is needed.
+        The protocol version of the pickle is detected automatically, so
+        no proto argument is needed.
+
+        The argument *file* must have two methods, a read() method that
+        takes an integer argument, and a readline() method that requires
+        no arguments.  Both methods should return bytes.  Thus *file*
+        can be a binary file object opened for reading, an io.BytesIO
+        object, or any other custom object that meets this interface.
 
         The file-like object must have two methods, a read() method
         that takes an integer argument, and a readline() method that
@@ -985,13 +991,14 @@
         reading, a BytesIO object, or any other custom object that
         meets this interface.
 
-        Optional keyword arguments are *fix_imports*, *encoding* and *errors*,
-        which are used to control compatiblity support for pickle stream
-        generated by Python 2.x.  If *fix_imports* is True, pickle will try to
-        map the old Python 2.x names to the new names used in Python 3.x.  The
-        *encoding* and *errors* tell pickle how to decode 8-bit string
-        instances pickled by Python 2.x; these default to 'ASCII' and
-        'strict', respectively.
+        Optional keyword arguments are *fix_imports*, *encoding* and
+        *errors*, which are used to control compatibility support for
+        pickle streams generated by Python 2.  If *fix_imports* is True,
+        pickle will try to map the old Python 2 names to the new names
+        used in Python 3.  The *encoding* and *errors* tell pickle how
+        to decode 8-bit string instances pickled by Python 2; these
+        default to 'ASCII' and 'strict', respectively. *encoding* can be
+        'bytes' to read these 8-bit string instances as bytes objects.
         """
         self._file_readline = file.readline
         self._file_read = file.read
@@ -1139,6 +1146,15 @@
         self.append(unpack('>d', self.read(8))[0])
     dispatch[BINFLOAT[0]] = load_binfloat
 
+    def _decode_string(self, value):
+        # Used to allow strings from Python 2 to be decoded either as
+        # bytes or Unicode strings.  This should be used only with the
+        # STRING, BINSTRING and SHORT_BINSTRING opcodes.
+        if self.encoding == "bytes":
+            return value
+        else:
+            return value.decode(self.encoding, self.errors)
+
     def load_string(self):
         data = self.readline()[:-1]
         # Strip outermost quotes
@@ -1146,8 +1162,7 @@
             data = data[1:-1]
         else:
             raise UnpicklingError("the STRING opcode argument must be quoted")
-        self.append(codecs.escape_decode(data)[0]
-                    .decode(self.encoding, self.errors))
+        self.append(self._decode_string(codecs.escape_decode(data)[0]))
     dispatch[STRING[0]] = load_string
 
     def load_binstring(self):
@@ -1156,8 +1171,7 @@
         if len < 0:
             raise UnpicklingError("BINSTRING pickle has negative byte count")
         data = self.read(len)
-        value = str(data, self.encoding, self.errors)
-        self.append(value)
+        self.append(self._decode_string(data))
     dispatch[BINSTRING[0]] = load_binstring
 
     def load_binbytes(self):
@@ -1191,8 +1205,7 @@
     def load_short_binstring(self):
         len = self.read(1)[0]
         data = self.read(len)
-        value = str(data, self.encoding, self.errors)
-        self.append(value)
+        self.append(self._decode_string(data))
     dispatch[SHORT_BINSTRING[0]] = load_short_binstring
 
     def load_short_binbytes(self):

diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index a2480f6..71c2aa1 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py

@@ -969,113 +969,107 @@
         return self.name
 
 
-pyint = StackObject(
-            name='int',
-            obtype=int,
-            doc="A short (as opposed to long) Python integer object.")
-
-pylong = StackObject(
-             name='long',
-             obtype=int,
-             doc="A long (as opposed to short) Python integer object.")
+pyint = pylong = StackObject(
+    name='int',
+    obtype=int,
+    doc="A Python integer object.")
 
 pyinteger_or_bool = StackObject(
-                        name='int_or_bool',
-                        obtype=(int, bool),
-                        doc="A Python integer object (short or long), or "
-                            "a Python bool.")
+    name='int_or_bool',
+    obtype=(int, bool),
+    doc="A Python integer or boolean object.")
 
 pybool = StackObject(
-             name='bool',
-             obtype=(bool,),
-             doc="A Python bool object.")
+    name='bool',
+    obtype=bool,
+    doc="A Python boolean object.")
 
 pyfloat = StackObject(
-              name='float',
-              obtype=float,
-              doc="A Python float object.")
+    name='float',
+    obtype=float,
+    doc="A Python float object.")
 
-pystring = StackObject(
-               name='string',
-               obtype=bytes,
-               doc="A Python (8-bit) string object.")
+pybytes_or_str = pystring = StackObject(
+    name='bytes_or_str',
+    obtype=(bytes, str),
+    doc="A Python bytes or (Unicode) string object.")
 
 pybytes = StackObject(
-               name='bytes',
-               obtype=bytes,
-               doc="A Python bytes object.")
+    name='bytes',
+    obtype=bytes,
+    doc="A Python bytes object.")
 
 pyunicode = StackObject(
-                name='str',
-                obtype=str,
-                doc="A Python (Unicode) string object.")
+    name='str',
+    obtype=str,
+    doc="A Python (Unicode) string object.")
 
 pynone = StackObject(
-             name="None",
-             obtype=type(None),
-             doc="The Python None object.")
+    name="None",
+    obtype=type(None),
+    doc="The Python None object.")
 
 pytuple = StackObject(
-              name="tuple",
-              obtype=tuple,
-              doc="A Python tuple object.")
+    name="tuple",
+    obtype=tuple,
+    doc="A Python tuple object.")
 
 pylist = StackObject(
-             name="list",
-             obtype=list,
-             doc="A Python list object.")
+    name="list",
+    obtype=list,
+    doc="A Python list object.")
 
 pydict = StackObject(
-             name="dict",
-             obtype=dict,
-             doc="A Python dict object.")
+    name="dict",
+    obtype=dict,
+    doc="A Python dict object.")
 
 pyset = StackObject(
-            name="set",
-            obtype=set,
-            doc="A Python set object.")
+    name="set",
+    obtype=set,
+    doc="A Python set object.")
 
 pyfrozenset = StackObject(
-                  name="frozenset",
-                  obtype=set,
-                  doc="A Python frozenset object.")
+    name="frozenset",
+    obtype=set,
+    doc="A Python frozenset object.")
 
 anyobject = StackObject(
-                name='any',
-                obtype=object,
-                doc="Any kind of object whatsoever.")
+    name='any',
+    obtype=object,
+    doc="Any kind of object whatsoever.")
 
 markobject = StackObject(
-                 name="mark",
-                 obtype=StackObject,
-                 doc="""'The mark' is a unique object.
+    name="mark",
+    obtype=StackObject,
+    doc="""'The mark' is a unique object.
 
-                 Opcodes that operate on a variable number of objects
-                 generally don't embed the count of objects in the opcode,
-                 or pull it off the stack.  Instead the MARK opcode is used
-                 to push a special marker object on the stack, and then
-                 some other opcodes grab all the objects from the top of
-                 the stack down to (but not including) the topmost marker
-                 object.
-                 """)
+Opcodes that operate on a variable number of objects
+generally don't embed the count of objects in the opcode,
+or pull it off the stack.  Instead the MARK opcode is used
+to push a special marker object on the stack, and then
+some other opcodes grab all the objects from the top of
+the stack down to (but not including) the topmost marker
+object.
+""")
 
 stackslice = StackObject(
-                 name="stackslice",
-                 obtype=StackObject,
-                 doc="""An object representing a contiguous slice of the stack.
+    name="stackslice",
+    obtype=StackObject,
+    doc="""An object representing a contiguous slice of the stack.
 
-                 This is used in conjunction with markobject, to represent all
-                 of the stack following the topmost markobject.  For example,
-                 the POP_MARK opcode changes the stack from
+This is used in conjunction with markobject, to represent all
+of the stack following the topmost markobject.  For example,
+the POP_MARK opcode changes the stack from
 
-                     [..., markobject, stackslice]
-                 to
-                     [...]
+    [..., markobject, stackslice]
+to
+    [...]
 
-                 No matter how many object are on the stack after the topmost
-                 markobject, POP_MARK gets rid of all of them (including the
-                 topmost markobject too).
-                 """)
+No matter how many objects are on the stack after the topmost
+markobject, POP_MARK gets rid of all of them (including the
+topmost markobject too).
+""")
 
 ##############################################################################
 # Descriptors for pickle opcodes.
@@ -1212,7 +1206,7 @@
       code='L',
       arg=decimalnl_long,
       stack_before=[],
-      stack_after=[pylong],
+      stack_after=[pyint],
       proto=0,
       doc="""Push a long integer.
 
@@ -1230,7 +1224,7 @@
       code='\x8a',
       arg=long1,
       stack_before=[],
-      stack_after=[pylong],
+      stack_after=[pyint],
       proto=2,
       doc="""Long integer using one-byte length.
 
@@ -1241,7 +1235,7 @@
       code='\x8b',
       arg=long4,
       stack_before=[],
-      stack_after=[pylong],
+      stack_after=[pyint],
       proto=2,
       doc="""Long integer using found-byte length.
 
@@ -1254,45 +1248,50 @@
       code='S',
       arg=stringnl,
       stack_before=[],
-      stack_after=[pystring],
+      stack_after=[pybytes_or_str],
       proto=0,
       doc="""Push a Python string object.
 
       The argument is a repr-style string, with bracketing quote characters,
       and perhaps embedded escapes.  The argument extends until the next
-      newline character.  (Actually, they are decoded into a str instance
+      newline character.  These are usually decoded into a str instance
       using the encoding given to the Unpickler constructor. or the default,
-      'ASCII'.)
+      'ASCII'.  If the encoding given was 'bytes' however, they will be
+      decoded as bytes objects instead.
       """),
 
     I(name='BINSTRING',
       code='T',
       arg=string4,
       stack_before=[],
-      stack_after=[pystring],
+      stack_after=[pybytes_or_str],
       proto=1,
       doc="""Push a Python string object.
 
-      There are two arguments:  the first is a 4-byte little-endian signed int
-      giving the number of bytes in the string, and the second is that many
-      bytes, which are taken literally as the string content.  (Actually,
-      they are decoded into a str instance using the encoding given to the
-      Unpickler constructor. or the default, 'ASCII'.)
+      There are two arguments: the first is a 4-byte little-endian
+      signed int giving the number of bytes in the string, and the
+      second is that many bytes, which are taken literally as the string
+      content.  These are usually decoded into a str instance using the
+      encoding given to the Unpickler constructor, or the default,
+      'ASCII'.  If the encoding given was 'bytes' however, they will be
+      decoded as bytes objects instead.
       """),
 
     I(name='SHORT_BINSTRING',
       code='U',
       arg=string1,
       stack_before=[],
-      stack_after=[pystring],
+      stack_after=[pybytes_or_str],
       proto=1,
       doc="""Push a Python string object.
 
-      There are two arguments:  the first is a 1-byte unsigned int giving
-      the number of bytes in the string, and the second is that many bytes,
-      which are taken literally as the string content.  (Actually, they
-      are decoded into a str instance using the encoding given to the
-      Unpickler constructor. or the default, 'ASCII'.)
+      There are two arguments: the first is a 1-byte unsigned int giving
+      the number of bytes in the string, and the second is that many
+      bytes, which are taken literally as the string content.  These are
+      usually decoded into a str instance using the encoding given to
+      the Unpickler constructor, or the default, 'ASCII'.  If the
+      encoding given was 'bytes' however, they will be decoded as bytes
+      objects instead.
       """),
 
     # Bytes (protocol 3 only; older protocols don't support bytes at all)

diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 040c26f..05befbf 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py

@@ -1305,6 +1305,35 @@
         dumped = self.dumps(set([3]), 2)
         self.assertEqual(dumped, DATA6)
 
+    def test_load_python2_str_as_bytes(self):
+        # From Python 2: pickle.dumps('a\x00\xa0', protocol=0)
+        self.assertEqual(self.loads(b"S'a\\x00\\xa0'\n.",
+                                    encoding="bytes"), b'a\x00\xa0')
+        # From Python 2: pickle.dumps('a\x00\xa0', protocol=1)
+        self.assertEqual(self.loads(b'U\x03a\x00\xa0.',
+                                    encoding="bytes"), b'a\x00\xa0')
+        # From Python 2: pickle.dumps('a\x00\xa0', protocol=2)
+        self.assertEqual(self.loads(b'\x80\x02U\x03a\x00\xa0.',
+                                    encoding="bytes"), b'a\x00\xa0')
+
+    def test_load_python2_unicode_as_str(self):
+        # From Python 2: pickle.dumps(u'π', protocol=0)
+        self.assertEqual(self.loads(b'V\\u03c0\n.',
+                                    encoding='bytes'), 'π')
+        # From Python 2: pickle.dumps(u'π', protocol=1)
+        self.assertEqual(self.loads(b'X\x02\x00\x00\x00\xcf\x80.',
+                                    encoding="bytes"), 'π')
+        # From Python 2: pickle.dumps(u'π', protocol=2)
+        self.assertEqual(self.loads(b'\x80\x02X\x02\x00\x00\x00\xcf\x80.',
+                                    encoding="bytes"), 'π')
+
+    def test_load_long_python2_str_as_bytes(self):
+        # From Python 2: pickle.dumps('x' * 300, protocol=1)
+        self.assertEqual(self.loads(pickle.BINSTRING +
+                                    struct.pack("<I", 300) +
+                                    b'x' * 300 + pickle.STOP,
+                                    encoding='bytes'), b'x' * 300)
+
     def test_large_pickles(self):
         # Test the correctness of internal buffering routines when handling
         # large data.
@@ -1566,7 +1595,6 @@
                     unpickled = self.loads(self.dumps(method, proto))
                     self.assertEqual(method(obj), unpickled(obj))
 
-
     def test_c_methods(self):
         global Subclass
         class Subclass(tuple):

diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py
index fbe96ac..0b2fe1e 100644
--- a/Lib/test/test_pickle.py
+++ b/Lib/test/test_pickle.py

@@ -83,13 +83,17 @@
 
 
 class PyDispatchTableTests(AbstractDispatchTableTests):
+
     pickler_class = pickle._Pickler
+
     def get_dispatch_table(self):
         return pickle.dispatch_table.copy()
 
 
 class PyChainDispatchTableTests(AbstractDispatchTableTests):
+
     pickler_class = pickle._Pickler
+
     def get_dispatch_table(self):
         return collections.ChainMap({}, pickle.dispatch_table)
commit	d05c9ff84501d93b13de40a9c7b0360c7d2ebada	[log] [tgz]
author	Alexandre Vassalotti <alexandre@peadrop.com>	Sat Dec 07 01:09:27 2013 -0800
committer	Alexandre Vassalotti <alexandre@peadrop.com>	Sat Dec 07 01:09:27 2013 -0800
tree	ae840ca5e91d21e53cc60e6c3e7fdd64b5a9fec4
parent	ee07b94788e5e3e79f6632e92a5295adc3937bf4 [diff]