bpo-32072: Fix issues with binary plists. (#4455) * Fixed saving bytearrays. * Identical objects will be saved only once. * Equal references will be load as identical objects. * Added support for saving and loading recursive data structures.

commit: a897aeeef647259a938a36cb5eb6680c86021c6a [log] [tgz]
author: Serhiy Storchaka <storchaka@gmail.com> Thu Nov 30 23:26:11 2017 +0200
committer: GitHub <noreply@github.com> Thu Nov 30 23:26:11 2017 +0200
tree: 2659012d46b7222259d37a81a5f3d5fb330310eb
parent: b4d1e1f7c1af6ae33f0e371576c8bcafedb099db [diff] [blame]
diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index 2113a2d..21ebec3 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py

@@ -525,6 +525,8 @@
 
 _BINARY_FORMAT = {1: 'B', 2: 'H', 4: 'L', 8: 'Q'}
 
+_undefined = object()
+
 class _BinaryPlistParser:
     """
     Read or write a binary plist file, following the description of the binary
@@ -555,7 +557,8 @@
             ) = struct.unpack('>6xBBQQQ', trailer)
             self._fp.seek(offset_table_offset)
             self._object_offsets = self._read_ints(num_objects, offset_size)
-            return self._read_object(self._object_offsets[top_object])
+            self._objects = [_undefined] * num_objects
+            return self._read_object(top_object)
 
         except (OSError, IndexError, struct.error, OverflowError,
                 UnicodeDecodeError):
@@ -584,62 +587,68 @@
     def _read_refs(self, n):
         return self._read_ints(n, self._ref_size)
 
-    def _read_object(self, offset):
+    def _read_object(self, ref):
         """
-        read the object at offset.
+        read the object by reference.
 
         May recursively read sub-objects (content of an array/dict/set)
         """
+        result = self._objects[ref]
+        if result is not _undefined:
+            return result
+
+        offset = self._object_offsets[ref]
         self._fp.seek(offset)
         token = self._fp.read(1)[0]
         tokenH, tokenL = token & 0xF0, token & 0x0F
 
         if token == 0x00:
-            return None
+            result = None
 
         elif token == 0x08:
-            return False
+            result = False
 
         elif token == 0x09:
-            return True
+            result = True
 
         # The referenced source code also mentions URL (0x0c, 0x0d) and
         # UUID (0x0e), but neither can be generated using the Cocoa libraries.
 
         elif token == 0x0f:
-            return b''
+            result = b''
 
         elif tokenH == 0x10:  # int
-            return int.from_bytes(self._fp.read(1 << tokenL),
-                                  'big', signed=tokenL >= 3)
+            result = int.from_bytes(self._fp.read(1 << tokenL),
+                                    'big', signed=tokenL >= 3)
 
         elif token == 0x22: # real
-            return struct.unpack('>f', self._fp.read(4))[0]
+            result = struct.unpack('>f', self._fp.read(4))[0]
 
         elif token == 0x23: # real
-            return struct.unpack('>d', self._fp.read(8))[0]
+            result = struct.unpack('>d', self._fp.read(8))[0]
 
         elif token == 0x33:  # date
             f = struct.unpack('>d', self._fp.read(8))[0]
             # timestamp 0 of binary plists corresponds to 1/1/2001
             # (year of Mac OS X 10.0), instead of 1/1/1970.
-            return datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=f)
+            result = (datetime.datetime(2001, 1, 1) +
+                      datetime.timedelta(seconds=f))
 
         elif tokenH == 0x40:  # data
             s = self._get_size(tokenL)
             if self._use_builtin_types:
-                return self._fp.read(s)
+                result = self._fp.read(s)
             else:
-                return Data(self._fp.read(s))
+                result = Data(self._fp.read(s))
 
         elif tokenH == 0x50:  # ascii string
             s = self._get_size(tokenL)
             result =  self._fp.read(s).decode('ascii')
-            return result
+            result = result
 
         elif tokenH == 0x60:  # unicode string
             s = self._get_size(tokenL)
-            return self._fp.read(s * 2).decode('utf-16be')
+            result = self._fp.read(s * 2).decode('utf-16be')
 
         # tokenH == 0x80 is documented as 'UID' and appears to be used for
         # keyed-archiving, not in plists.
@@ -647,8 +656,9 @@
         elif tokenH == 0xA0:  # array
             s = self._get_size(tokenL)
             obj_refs = self._read_refs(s)
-            return [self._read_object(self._object_offsets[x])
-                for x in obj_refs]
+            result = []
+            self._objects[ref] = result
+            result.extend(self._read_object(x) for x in obj_refs)
 
         # tokenH == 0xB0 is documented as 'ordset', but is not actually
         # implemented in the Apple reference code.
@@ -661,12 +671,15 @@
             key_refs = self._read_refs(s)
             obj_refs = self._read_refs(s)
             result = self._dict_type()
+            self._objects[ref] = result
             for k, o in zip(key_refs, obj_refs):
-                result[self._read_object(self._object_offsets[k])
-                    ] = self._read_object(self._object_offsets[o])
-            return result
+                result[self._read_object(k)] = self._read_object(o)
 
-        raise InvalidFileException()
+        else:
+            raise InvalidFileException()
+
+        self._objects[ref] = result
+        return result
 
 def _count_to_size(count):
     if count < 1 << 8:
@@ -681,6 +694,8 @@
     else:
         return 8
 
+_scalars = (str, int, float, datetime.datetime, bytes)
+
 class _BinaryPlistWriter (object):
     def __init__(self, fp, sort_keys, skipkeys):
         self._fp = fp
@@ -736,8 +751,7 @@
         # First check if the object is in the object table, not used for
         # containers to ensure that two subcontainers with the same contents
         # will be serialized as distinct values.
-        if isinstance(value, (
-                str, int, float, datetime.datetime, bytes, bytearray)):
+        if isinstance(value, _scalars):
             if (type(value), value) in self._objtable:
                 return
 
@@ -745,15 +759,17 @@
             if (type(value.data), value.data) in self._objtable:
                 return
 
+        elif id(value) in self._objidtable:
+            return
+
         # Add to objectreference map
         refnum = len(self._objlist)
         self._objlist.append(value)
-        try:
-            if isinstance(value, Data):
-                self._objtable[(type(value.data), value.data)] = refnum
-            else:
-                self._objtable[(type(value), value)] = refnum
-        except TypeError:
+        if isinstance(value, _scalars):
+            self._objtable[(type(value), value)] = refnum
+        elif isinstance(value, Data):
+            self._objtable[(type(value.data), value.data)] = refnum
+        else:
             self._objidtable[id(value)] = refnum
 
         # And finally recurse into containers
@@ -780,12 +796,11 @@
                 self._flatten(o)
 
     def _getrefnum(self, value):
-        try:
-            if isinstance(value, Data):
-                return self._objtable[(type(value.data), value.data)]
-            else:
-                return self._objtable[(type(value), value)]
-        except TypeError:
+        if isinstance(value, _scalars):
+            return self._objtable[(type(value), value)]
+        elif isinstance(value, Data):
+            return self._objtable[(type(value.data), value.data)]
+        else:
             return self._objidtable[id(value)]
 
     def _write_size(self, token, size):
commit	a897aeeef647259a938a36cb5eb6680c86021c6a	[log] [tgz]
author	Serhiy Storchaka <storchaka@gmail.com>	Thu Nov 30 23:26:11 2017 +0200
committer	GitHub <noreply@github.com>	Thu Nov 30 23:26:11 2017 +0200
tree	2659012d46b7222259d37a81a5f3d5fb330310eb
parent	b4d1e1f7c1af6ae33f0e371576c8bcafedb099db [diff] [blame]