Now that proto2 is defined, replaced XXX blocks with text about it. Also moved the proto2 opcode descriptors into the sections they fit (like TUPLE{1,2,3} in the group of tuple-building opcodes; etc).

commit: fdc03462b3e0796ae6474da6f0f9844773d1da8f [log] [tgz]
author: Tim Peters <tim.peters@gmail.com> Tue Jan 28 04:56:33 2003 +0000
committer: Tim Peters <tim.peters@gmail.com> Tue Jan 28 04:56:33 2003 +0000
tree: 0d6ae8e0890e03b7778ec5d0d457a562f7a5c799
parent: 7d97d31a1b254104037c3653fa2229cb273fa713 [diff] [blame]
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index 152ea88..a2ba09d 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py

@@ -70,10 +70,12 @@
 + Backward compatibility and micro-optimization.  As explained below,
   pickle opcodes never go away, not even when better ways to do a thing
   get invented.  The repertoire of the PM just keeps growing over time.
-  So, e.g., there are now five distinct opcodes for building a Python integer,
-  four of them devoted to "short" integers.  Even so, the only way to pickle
-  a Python long int takes time quadratic in the number of digits, for both
-  pickling and unpickling.  This isn't so much a subtlety as a source of
+  For example, protocol 0 had two opcodes for building Python integers (INT
+  and LONG), protocol 1 added three more for more-efficient pickling of short
+  integers, and protocol 2 added two more for more-efficient pickling of
+  long integers (before protocol 2, the only ways to pickle a Python long
+  took time quadratic in the number of digits, for both pickling and
+  unpickling).  "Opcode bloat" isn't so much a subtlety as a source of
   wearying complication.
 
 
@@ -93,7 +95,8 @@
 The original pickle used what's now called "protocol 0", and what was called
 "text mode" before Python 2.3.  The entire pickle bytestream is made up of
 printable 7-bit ASCII characters, plus the newline character, in protocol 0.
-That's why it was called text mode.
+That's why it was called text mode.  Protocol 0 is small and elegant, but
+sometimes painfully inefficient.
 
 The second major set of additions is now called "protocol 1", and was called
 "binary mode" before Python 2.3.  This added many opcodes with arguments
@@ -101,10 +104,27 @@
 bytes.  Binary mode pickles can be substantially smaller than equivalent
 text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
 int as 4 bytes following the opcode, which is cheaper to unpickle than the
-(perhaps) 11-character decimal string attached to INT.
+(perhaps) 11-character decimal string attached to INT.  Protocol 1 also added
+a number of opcodes that operate on many stack elements at once (like APPENDS
+and SETITEMS).
 
 The third major set of additions came in Python 2.3, and is called "protocol
-2".  XXX Write a short blurb when Guido figures out what they are <wink>. XXX
+2".  This added:
+
+- A better way to pickle instances of new-style classes (NEWOBJ).
+
+- A way for a pickle to identify its protocol (PROTO).
+
+- Time- and space- efficient pickling of long ints (LONG{1,4}).
+
+- Shortcuts for small tuples (TUPLE{1,2,3}}.
+
+- Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
+
+- The "extension registry", a vector of popular objects that can be pushed
+  efficiently by index (EXT{1,2,4}).  This is akin to the memo and GET, but
+  the registry contents are predefined (there's nothing akin to the memo's
+  PUT).
 """
 
 # Meta-rule:  Descriptions are stored in instances of descriptor objects,
@@ -889,19 +909,6 @@
       earlier unpicklers ignore the leading "0" and return the int.
       """),
 
-    I(name='LONG',
-      code='L',
-      arg=decimalnl_long,
-      stack_before=[],
-      stack_after=[pylong],
-      proto=0,
-      doc="""Push a long integer.
-
-      The same as INT, except that the literal ends with 'L', and always
-      unpickles to a Python long.  There doesn't seem a real purpose to the
-      trailing 'L'.
-      """),
-
     I(name='BININT',
       code='J',
       arg=int4,
@@ -941,6 +948,46 @@
       BININT2, but BININT1 instead saves a byte.
       """),
 
+    I(name='LONG',
+      code='L',
+      arg=decimalnl_long,
+      stack_before=[],
+      stack_after=[pylong],
+      proto=0,
+      doc="""Push a long integer.
+
+      The same as INT, except that the literal ends with 'L', and always
+      unpickles to a Python long.  There doesn't seem a real purpose to the
+      trailing 'L'.
+
+      Note that LONG takes time quadratic in the number of digits when
+      unpickling (this is simply due to the nature of decimal->binary
+      conversion).  Proto 2 added linear-time (in C; still quadratic-time
+      in Python) LONG1 and LONG4 opcodes.
+      """),
+
+    I(name="LONG1",
+      code='\x8a',
+      arg=long1,
+      stack_before=[],
+      stack_after=[pylong],
+      proto=2,
+      doc="""Long integer using one-byte length.
+
+      A more efficient encoding of a Python long; the long1 encoding
+      says it all."""),
+
+    I(name="LONG4",
+      code='\x8b',
+      arg=long4,
+      stack_before=[],
+      stack_after=[pylong],
+      proto=2,
+      doc="""Long integer using found-byte length.
+
+      A more efficient encoding of a Python long; the long4 encoding
+      says it all."""),
+
     # Ways to spell strings (8-bit, not Unicode).
 
     I(name='STRING',
@@ -992,6 +1039,29 @@
       proto=0,
       doc="Push None on the stack."),
 
+    # Ways to spell bools, starting with proto 2.  See INT for how this was
+    # done before proto 2.
+
+    I(name='NEWTRUE',
+      code='\x88',
+      arg=None,
+      stack_before=[],
+      stack_after=[pybool],
+      proto=2,
+      doc="""True.
+
+      Push True onto the stack."""),
+
+    I(name='NEWFALSE',
+      code='\x89',
+      arg=None,
+      stack_before=[],
+      stack_after=[pybool],
+      proto=2,
+      doc="""True.
+
+      Push False onto the stack."""),
+
     # Ways to spell Unicode strings.
 
     I(name='UNICODE',
@@ -1134,6 +1204,48 @@
       Stack after:  ... (1, 2, 3, 'abc')
       """),
 
+    I(name='TUPLE1',
+      code='\x85',
+      arg=None,
+      stack_before=[anyobject],
+      stack_after=[pytuple],
+      proto=2,
+      doc="""One-tuple.
+
+      This code pops one value off the stack and pushes a tuple of
+      length 1 whose one item is that value back onto it.  IOW:
+
+          stack[-1] = tuple(stack[-1:])
+      """),
+
+    I(name='TUPLE2',
+      code='\x86',
+      arg=None,
+      stack_before=[anyobject, anyobject],
+      stack_after=[pytuple],
+      proto=2,
+      doc="""One-tuple.
+
+      This code pops two values off the stack and pushes a tuple
+      of length 2 whose items are those values back onto it.  IOW:
+
+          stack[-2:] = [tuple(stack[-2:])]
+      """),
+
+    I(name='TUPLE3',
+      code='\x87',
+      arg=None,
+      stack_before=[anyobject, anyobject, anyobject],
+      stack_after=[pytuple],
+      proto=2,
+      doc="""One-tuple.
+
+      This code pops three values off the stack and pushes a tuple
+      of length 3 whose items are those values back onto it.  IOW:
+
+          stack[-3:] = [tuple(stack[-3:])]
+      """),
+
     # Ways to build dicts.
 
     I(name='EMPTY_DICT',
@@ -1317,6 +1429,53 @@
       signed little-endian integer following.
       """),
 
+    # Access the extension registry (predefined objects).  Akin to the GET
+    # family.
+
+    I(name='EXT1',
+      code='\x82',
+      arg=uint1,
+      stack_before=[],
+      stack_after=[anyobject],
+      proto=2,
+      doc="""Extension code.
+
+      This code and the similar EXT2 and EXT4 allow using a registry
+      of popular objects that are pickled by name, typically classes.
+      It is envisioned that through a global negotiation and
+      registration process, third parties can set up a mapping between
+      ints and object names.
+
+      In order to guarantee pickle interchangeability, the extension
+      code registry ought to be global, although a range of codes may
+      be reserved for private use.
+
+      EXT1 has a 1-byte integer argument.  This is used to index into the
+      extension registry, and the object at that index is pushed on the stack.
+      """),
+
+    I(name='EXT2',
+      code='\x83',
+      arg=uint2,
+      stack_before=[],
+      stack_after=[anyobject],
+      proto=2,
+      doc="""Extension code.
+
+      See EXT1.  EXT2 has a two-byte integer argument.
+      """),
+
+    I(name='EXT4',
+      code='\x84',
+      arg=int4,
+      stack_before=[],
+      stack_after=[anyobject],
+      proto=2,
+      doc="""Extension code.
+
+      See EXT1.  EXT4 has a four-byte integer argument.
+      """),
+
     # Push a class object, or module function, on the stack, via its module
     # and name.
 
@@ -1487,8 +1646,35 @@
       like a bug).  See INST for the gory details.
       """),
 
+    I(name='NEWOBJ',
+      code='\x81',
+      arg=None,
+      stack_before=[anyobject, anyobject],
+      stack_after=[anyobject],
+      proto=2,
+      doc="""Build an object instance.
+
+      The stack before should be thought of as containing a class
+      object followed by an argument tuple (the tuple being the stack
+      top).  Call these cls and args.  They are popped off the stack,
+      and the value returned by cls.__new__(cls, *args) is pushed back
+      onto the stack.
+      """),
+
     # Machine control.
 
+    I(name='PROTO',
+      code='\x80',
+      arg=uint1,
+      stack_before=[],
+      stack_after=[],
+      proto=2,
+      doc="""Protocol version indicator.
+
+      For protocol 2 and above, a pickle must start with this opcode.
+      The argument is the protocol version, an int in range(2, 256).
+      """),
+
     I(name='STOP',
       code='.',
       arg=None,
@@ -1534,161 +1720,6 @@
       ID is passed to self.persistent_load(), and whatever object that
       returns is pushed on the stack.  See PERSID for more detail.
       """),
-
-    # Protocol 2 opcodes
-
-    I(name='PROTO',
-      code='\x80',
-      arg=uint1,
-      stack_before=[],
-      stack_after=[],
-      proto=2,
-      doc="""Protocol version indicator.
-
-      For protocol 2 and above, a pickle must start with this opcode.
-      The argument is the protocol version, an int in range(2, 256).
-      """),
-
-    I(name='NEWOBJ',
-      code='\x81',
-      arg=None,
-      stack_before=[anyobject, anyobject],
-      stack_after=[anyobject],
-      proto=2,
-      doc="""Build an object instance.
-
-      The stack before should be thought of as containing a class
-      object followed by an argument tuple (the tuple being the stack
-      top).  Call these cls and args.  They are popped off the stack,
-      and the value returned by cls.__new__(cls, *args) is pushed back
-      onto the stack.
-      """),
-
-    I(name='EXT1',
-      code='\x82',
-      arg=uint1,
-      stack_before=[],
-      stack_after=[anyobject],
-      proto=2,
-      doc="""Extension code.
-
-      This code and the similar EXT2 and EXT4 allow using a registry
-      of popular objects that are pickled by name, typically classes.
-      It is envisioned that through a global negotiation and
-      registration process, third parties can set up a mapping between
-      ints and object names.
-
-      In order to guarantee pickle interchangeability, the extension
-      code registry ought to be global, although a range of codes may
-      be reserved for private use.
-      """),
-
-    I(name='EXT2',
-      code='\x83',
-      arg=uint2,
-      stack_before=[],
-      stack_after=[anyobject],
-      proto=2,
-      doc="""Extension code.
-
-      See EXT1.
-      """),
-
-    I(name='EXT4',
-      code='\x84',
-      arg=int4,
-      stack_before=[],
-      stack_after=[anyobject],
-      proto=2,
-      doc="""Extension code.
-
-      See EXT1.
-      """),
-
-    I(name='TUPLE1',
-      code='\x85',
-      arg=None,
-      stack_before=[anyobject],
-      stack_after=[pytuple],
-      proto=2,
-      doc="""One-tuple.
-
-      This code pops one value off the stack and pushes a tuple of
-      length 1 whose one item is that value back onto it.  IOW:
-
-          stack[-1] = tuple(stack[-1:])
-      """),
-
-    I(name='TUPLE2',
-      code='\x86',
-      arg=None,
-      stack_before=[anyobject, anyobject],
-      stack_after=[pytuple],
-      proto=2,
-      doc="""One-tuple.
-
-      This code pops two values off the stack and pushes a tuple
-      of length 2 whose items are those values back onto it.  IOW:
-
-          stack[-2:] = [tuple(stack[-2:])]
-      """),
-
-    I(name='TUPLE3',
-      code='\x87',
-      arg=None,
-      stack_before=[anyobject, anyobject, anyobject],
-      stack_after=[pytuple],
-      proto=2,
-      doc="""One-tuple.
-
-      This code pops three values off the stack and pushes a tuple
-      of length 3 whose items are those values back onto it.  IOW:
-
-          stack[-3:] = [tuple(stack[-3:])]
-      """),
-
-    I(name='NEWTRUE',
-      code='\x88',
-      arg=None,
-      stack_before=[],
-      stack_after=[pybool],
-      proto=2,
-      doc="""True.
-
-      Push True onto the stack."""),
-
-    I(name='NEWFALSE',
-      code='\x89',
-      arg=None,
-      stack_before=[],
-      stack_after=[pybool],
-      proto=2,
-      doc="""True.
-
-      Push False onto the stack."""),
-
-    I(name="LONG1",
-      code='\x8a',
-      arg=long1,
-      stack_before=[],
-      stack_after=[pylong],
-      proto=2,
-      doc="""Long integer using one-byte length.
-
-      A more efficient encoding of a Python long; the long1 encoding
-      says it all."""),
-
-    I(name="LONG4",
-      code='\x8b',
-      arg=long4,
-      stack_before=[],
-      stack_after=[pylong],
-      proto=2,
-      doc="""Long integer using found-byte length.
-
-      A more efficient encoding of a Python long; the long4 encoding
-      says it all."""),
-
 ]
 del I
commit	fdc03462b3e0796ae6474da6f0f9844773d1da8f	[log] [tgz]
author	Tim Peters <tim.peters@gmail.com>	Tue Jan 28 04:56:33 2003 +0000
committer	Tim Peters <tim.peters@gmail.com>	Tue Jan 28 04:56:33 2003 +0000
tree	0d6ae8e0890e03b7778ec5d0d457a562f7a5c799
parent	7d97d31a1b254104037c3653fa2229cb273fa713 [diff] [blame]