Blame - Lib/pickletools.py - platform/external/python/cpython3

blob: 386a95c209b1985971ea5fc9c5356ba98ecbb0aa [file] [log] [blame]

Skip Montanaro	5445594	2003-01-29 15:41:33 +0000	[diff] [blame]	1	'''"Executable documentation" for the pickle module.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2
				3	Extensive comments about the pickle protocols and pickle-machine opcodes
				4	can be found here. Some functions meant for external use:
				5
				6	genops(pickle)
				7	Generate all the opcodes in a pickle, as (opcode, arg, position) triples.
				8
Andrew M. Kuchling	d0c53fe	2004-08-07 16:51:30 +0000	[diff] [blame]	9	dis(pickle, out=None, memo=None, indentlevel=4)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	10	Print a symbolic disassembly of a pickle.
Skip Montanaro	5445594	2003-01-29 15:41:33 +0000	[diff] [blame]	11	'''
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	12
Walter Dörwald	42748a8	2007-06-12 16:40:17 +0000	[diff] [blame]	13	import codecs
Guido van Rossum	98297ee	2007-11-06 21:34:58 +0000	[diff] [blame]	14	import pickle
				15	import re
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	16	import sys
Walter Dörwald	42748a8	2007-06-12 16:40:17 +0000	[diff] [blame]	17
Christian Heimes	3feef61	2008-02-11 06:19:17 +0000	[diff] [blame]	18	__all__ = ['dis', 'genops', 'optimize']
Tim Peters	90cf212	2004-11-06 23:45:48 +0000	[diff] [blame]	19
Guido van Rossum	98297ee	2007-11-06 21:34:58 +0000	[diff] [blame]	20	bytes_types = pickle.bytes_types
				21
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	22	# Other ideas:
				23	#
				24	# - A pickle verifier: read a pickle and check it exhaustively for
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	25	# well-formedness. dis() does a lot of this already.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	26	#
				27	# - A protocol identifier: examine a pickle and return its protocol number
				28	# (== the highest .proto attr value among all the opcodes in the pickle).
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	29	# dis() already prints this info at the end.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	30	#
				31	# - A pickle optimizer: for example, tuple-building code is sometimes more
				32	# elaborate than necessary, catering for the possibility that the tuple
				33	# is recursive. Or lots of times a PUT is generated that's never accessed
				34	# by a later GET.
				35
				36
Victor Stinner	765531d	2013-03-26 01:11:54 +0100	[diff] [blame]	37	# "A pickle" is a program for a virtual pickle machine (PM, but more accurately
				38	# called an unpickling machine). It's a sequence of opcodes, interpreted by the
				39	# PM, building an arbitrarily complex Python object.
				40	#
				41	# For the most part, the PM is very simple: there are no looping, testing, or
				42	# conditional instructions, no arithmetic and no function calls. Opcodes are
				43	# executed once each, from first to last, until a STOP opcode is reached.
				44	#
				45	# The PM has two data areas, "the stack" and "the memo".
				46	#
				47	# Many opcodes push Python objects onto the stack; e.g., INT pushes a Python
				48	# integer object on the stack, whose value is gotten from a decimal string
				49	# literal immediately following the INT opcode in the pickle bytestream. Other
				50	# opcodes take Python objects off the stack. The result of unpickling is
				51	# whatever object is left on the stack when the final STOP opcode is executed.
				52	#
				53	# The memo is simply an array of objects, or it can be implemented as a dict
				54	# mapping little integers to objects. The memo serves as the PM's "long term
				55	# memory", and the little integers indexing the memo are akin to variable
				56	# names. Some opcodes pop a stack object into the memo at a given index,
				57	# and others push a memo object at a given index onto the stack again.
				58	#
				59	# At heart, that's all the PM has. Subtleties arise for these reasons:
				60	#
				61	# + Object identity. Objects can be arbitrarily complex, and subobjects
				62	# may be shared (for example, the list [a, a] refers to the same object a
				63	# twice). It can be vital that unpickling recreate an isomorphic object
				64	# graph, faithfully reproducing sharing.
				65	#
				66	# + Recursive objects. For example, after "L = []; L.append(L)", L is a
				67	# list, and L[0] is the same list. This is related to the object identity
				68	# point, and some sequences of pickle opcodes are subtle in order to
				69	# get the right result in all cases.
				70	#
				71	# + Things pickle doesn't know everything about. Examples of things pickle
				72	# does know everything about are Python's builtin scalar and container
				73	# types, like ints and tuples. They generally have opcodes dedicated to
				74	# them. For things like module references and instances of user-defined
				75	# classes, pickle's knowledge is limited. Historically, many enhancements
				76	# have been made to the pickle protocol in order to do a better (faster,
				77	# and/or more compact) job on those.
				78	#
				79	# + Backward compatibility and micro-optimization. As explained below,
				80	# pickle opcodes never go away, not even when better ways to do a thing
				81	# get invented. The repertoire of the PM just keeps growing over time.
				82	# For example, protocol 0 had two opcodes for building Python integers (INT
				83	# and LONG), protocol 1 added three more for more-efficient pickling of short
				84	# integers, and protocol 2 added two more for more-efficient pickling of
				85	# long integers (before protocol 2, the only ways to pickle a Python long
				86	# took time quadratic in the number of digits, for both pickling and
				87	# unpickling). "Opcode bloat" isn't so much a subtlety as a source of
				88	# wearying complication.
				89	#
				90	#
				91	# Pickle protocols:
				92	#
				93	# For compatibility, the meaning of a pickle opcode never changes. Instead new
				94	# pickle opcodes get added, and each version's unpickler can handle all the
				95	# pickle opcodes in all protocol versions to date. So old pickles continue to
				96	# be readable forever. The pickler can generally be told to restrict itself to
				97	# the subset of opcodes available under previous protocol versions too, so that
				98	# users can create pickles under the current version readable by older
				99	# versions. However, a pickle does not contain its version number embedded
				100	# within it. If an older unpickler tries to read a pickle using a later
				101	# protocol, the result is most likely an exception due to seeing an unknown (in
				102	# the older unpickler) opcode.
				103	#
				104	# The original pickle used what's now called "protocol 0", and what was called
				105	# "text mode" before Python 2.3. The entire pickle bytestream is made up of
				106	# printable 7-bit ASCII characters, plus the newline character, in protocol 0.
				107	# That's why it was called text mode. Protocol 0 is small and elegant, but
				108	# sometimes painfully inefficient.
				109	#
				110	# The second major set of additions is now called "protocol 1", and was called
				111	# "binary mode" before Python 2.3. This added many opcodes with arguments
				112	# consisting of arbitrary bytes, including NUL bytes and unprintable "high bit"
				113	# bytes. Binary mode pickles can be substantially smaller than equivalent
				114	# text mode pickles, and sometimes faster too; e.g., BININT represents a 4-byte
				115	# int as 4 bytes following the opcode, which is cheaper to unpickle than the
				116	# (perhaps) 11-character decimal string attached to INT. Protocol 1 also added
				117	# a number of opcodes that operate on many stack elements at once (like APPENDS
				118	# and SETITEMS), and "shortcut" opcodes (like EMPTY_DICT and EMPTY_TUPLE).
				119	#
				120	# The third major set of additions came in Python 2.3, and is called "protocol
				121	# 2". This added:
				122	#
				123	# - A better way to pickle instances of new-style classes (NEWOBJ).
				124	#
				125	# - A way for a pickle to identify its protocol (PROTO).
				126	#
				127	# - Time- and space- efficient pickling of long ints (LONG{1,4}).
				128	#
				129	# - Shortcuts for small tuples (TUPLE{1,2,3}).
				130	#
				131	# - Dedicated opcodes for bools (NEWTRUE, NEWFALSE).
				132	#
				133	# - The "extension registry", a vector of popular objects that can be pushed
				134	# efficiently by index (EXT{1,2,4}). This is akin to the memo and GET, but
				135	# the registry contents are predefined (there's nothing akin to the memo's
				136	# PUT).
				137	#
				138	# Another independent change with Python 2.3 is the abandonment of any
				139	# pretense that it might be safe to load pickles received from untrusted
				140	# parties -- no sufficient security analysis has been done to guarantee
				141	# this and there isn't a use case that warrants the expense of such an
				142	# analysis.
				143	#
				144	# To this end, all tests for __safe_for_unpickling__ or for
				145	# copyreg.safe_constructors are removed from the unpickling code.
				146	# References to these variables in the descriptions below are to be seen
				147	# as describing unpickling in Python 2.2 and before.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	148
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	149
				150	# Meta-rule: Descriptions are stored in instances of descriptor objects,
				151	# with plain constructors. No meta-language is defined from which
				152	# descriptors could be constructed. If you want, e.g., XML, write a little
				153	# program to generate XML from the objects.
				154
				155	##############################################################################
				156	# Some pickle opcodes have an argument, following the opcode in the
				157	# bytestream. An argument is of a specific type, described by an instance
				158	# of ArgumentDescriptor. These are not to be confused with arguments taken
				159	# off the stack -- ArgumentDescriptor applies only to arguments embedded in
				160	# the opcode stream, immediately following an opcode.
				161
				162	# Represents the number of bytes consumed by an argument delimited by the
				163	# next newline character.
				164	UP_TO_NEWLINE = -1
				165
				166	# Represents the number of bytes consumed by a two-argument opcode where
				167	# the first argument gives the number of bytes in the second argument.
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	168	TAKEN_FROM_ARGUMENT1 = -2 # num bytes is 1-byte unsigned int
				169	TAKEN_FROM_ARGUMENT4 = -3 # num bytes is 4-byte signed little-endian int
				170	TAKEN_FROM_ARGUMENT4U = -4 # num bytes is 4-byte unsigned little-endian int
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	171
				172	class ArgumentDescriptor(object):
				173	__slots__ = (
				174	# name of descriptor record, also a module global name; a string
				175	'name',
				176
				177	# length of argument, in bytes; an int; UP_TO_NEWLINE and
Tim Peters	fdb8cfa	2003-01-28 00:13:19 +0000	[diff] [blame]	178	# TAKEN_FROM_ARGUMENT{1,4} are negative values for variable-length
				179	# cases
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	180	'n',
				181
				182	# a function taking a file-like object, reading this kind of argument
				183	# from the object at the current position, advancing the current
				184	# position by n bytes, and returning the value of the argument
				185	'reader',
				186
				187	# human-readable docs for this arg descriptor; a string
				188	'doc',
				189	)
				190
				191	def __init__(self, name, n, reader, doc):
				192	assert isinstance(name, str)
				193	self.name = name
				194
				195	assert isinstance(n, int) and (n >= 0 or
Tim Peters	fdb8cfa	2003-01-28 00:13:19 +0000	[diff] [blame]	196	n in (UP_TO_NEWLINE,
				197	TAKEN_FROM_ARGUMENT1,
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	198	TAKEN_FROM_ARGUMENT4,
				199	TAKEN_FROM_ARGUMENT4U))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	200	self.n = n
				201
				202	self.reader = reader
				203
				204	assert isinstance(doc, str)
				205	self.doc = doc
				206
				207	from struct import unpack as _unpack
				208
				209	def read_uint1(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	210	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	211	>>> import io
				212	>>> read_uint1(io.BytesIO(b'\xff'))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	213	255
				214	"""
				215
				216	data = f.read(1)
				217	if data:
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	218	return data[0]
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	219	raise ValueError("not enough data in stream to read uint1")
				220
				221	uint1 = ArgumentDescriptor(
				222	name='uint1',
				223	n=1,
				224	reader=read_uint1,
				225	doc="One-byte unsigned integer.")
				226
				227
				228	def read_uint2(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	229	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	230	>>> import io
				231	>>> read_uint2(io.BytesIO(b'\xff\x00'))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	232	255
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	233	>>> read_uint2(io.BytesIO(b'\xff\xff'))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	234	65535
				235	"""
				236
				237	data = f.read(2)
				238	if len(data) == 2:
				239	return _unpack("<H", data)[0]
				240	raise ValueError("not enough data in stream to read uint2")
				241
				242	uint2 = ArgumentDescriptor(
				243	name='uint2',
				244	n=2,
				245	reader=read_uint2,
				246	doc="Two-byte unsigned integer, little-endian.")
				247
				248
				249	def read_int4(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	250	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	251	>>> import io
				252	>>> read_int4(io.BytesIO(b'\xff\x00\x00\x00'))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	253	255
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	254	>>> read_int4(io.BytesIO(b'\x00\x00\x00\x80')) == -(2**31)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	255	True
				256	"""
				257
				258	data = f.read(4)
				259	if len(data) == 4:
				260	return _unpack("<i", data)[0]
				261	raise ValueError("not enough data in stream to read int4")
				262
				263	int4 = ArgumentDescriptor(
				264	name='int4',
				265	n=4,
				266	reader=read_int4,
				267	doc="Four-byte signed integer, little-endian, 2's complement.")
				268
				269
def read_uint4(f):
    r"""Read a four-byte little-endian unsigned integer from f.

    Raises ValueError if fewer than four bytes are available.

    >>> import io
    >>> read_uint4(io.BytesIO(b'\xff\x00\x00\x00'))
    255
    >>> read_uint4(io.BytesIO(b'\x00\x00\x00\x80')) == 2**31
    True
    """

    raw = f.read(4)
    if len(raw) != 4:
        raise ValueError("not enough data in stream to read uint4")
    (value,) = _unpack("<I", raw)
    return value
				283
				284	uint4 = ArgumentDescriptor(
				285	name='uint4',
				286	n=4,
				287	reader=read_uint4,
				288	doc="Four-byte unsigned integer, little-endian.")
				289
				290
def read_stringnl(f, decode=True, stripquotes=True):
    r"""Read one newline-terminated string argument from f.

    f is a binary file-like object; one line is consumed with readline().
    The trailing newline is dropped.  If stripquotes is true, the remaining
    bytes must be bracketed by a matching pair of quote characters (either
    ' or "), which are removed.  If decode is true, backslash escapes are
    undone and the result is decoded as ASCII and returned as a str;
    otherwise the raw bytes are returned.

    Raises ValueError if the line has no terminating newline, or if
    stripquotes is true and the data is not properly quoted.

    >>> import io
    >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
    'abcd'

    >>> read_stringnl(io.BytesIO(b"\n"))
    Traceback (most recent call last):
    ...
    ValueError: no string quotes around b''

    >>> read_stringnl(io.BytesIO(b"\n"), stripquotes=False)
    ''

    >>> read_stringnl(io.BytesIO(b"''\n"))
    ''

    A lone quote character is not a quoted string.
    >>> read_stringnl(io.BytesIO(b"'\n"))
    Traceback (most recent call last):
    ...
    ValueError: string quote b"'" not found at both ends of b"'"

    >>> read_stringnl(io.BytesIO(b'"abcd"'))
    Traceback (most recent call last):
    ...
    ValueError: no newline found when trying to read stringnl

    Embedded escapes are undone in the result.
    >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'"))
    'a\n\\b\x00c\td'
    """

    data = f.readline()
    if not data.endswith(b'\n'):
        raise ValueError("no newline found when trying to read stringnl")
    data = data[:-1]    # lose the newline

    if stripquotes:
        for q in (b'"', b"'"):
            if data.startswith(q):
                # A single quote character both starts and ends with q, so
                # the length must be checked too; otherwise a lone quote
                # would be accepted as an empty string.
                if len(data) < 2 or not data.endswith(q):
                    raise ValueError("string quote %r not found at both "
                                     "ends of %r" % (q, data))
                data = data[1:-1]
                break
        else:
            # Neither kind of quote opens the data.
            raise ValueError("no string quotes around %r" % data)

    if decode:
        data = codecs.escape_decode(data)[0].decode("ascii")
    return data
				337
				338	stringnl = ArgumentDescriptor(
				339	name='stringnl',
				340	n=UP_TO_NEWLINE,
				341	reader=read_stringnl,
				342	doc="""A newline-terminated string.
				343
				344	This is a repr-style string, with embedded escapes, and
				345	bracketing quotes.
				346	""")
				347
				348	def read_stringnl_noescape(f):
Guido van Rossum	98297ee	2007-11-06 21:34:58 +0000	[diff] [blame]	349	return read_stringnl(f, stripquotes=False)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	350
				351	stringnl_noescape = ArgumentDescriptor(
				352	name='stringnl_noescape',
				353	n=UP_TO_NEWLINE,
				354	reader=read_stringnl_noescape,
				355	doc="""A newline-terminated string.
				356
				357	This is a str-style string, without embedded escapes,
				358	or bracketing quotes. It should consist solely of
				359	printable ASCII characters.
				360	""")
				361
				362	def read_stringnl_noescape_pair(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	363	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	364	>>> import io
				365	>>> read_stringnl_noescape_pair(io.BytesIO(b"Queue\nEmpty\njunk"))
Tim Peters	d916cf4	2003-01-27 19:01:47 +0000	[diff] [blame]	366	'Queue Empty'
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	367	"""
				368
Tim Peters	d916cf4	2003-01-27 19:01:47 +0000	[diff] [blame]	369	return "%s %s" % (read_stringnl_noescape(f), read_stringnl_noescape(f))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	370
				371	stringnl_noescape_pair = ArgumentDescriptor(
				372	name='stringnl_noescape_pair',
				373	n=UP_TO_NEWLINE,
				374	reader=read_stringnl_noescape_pair,
				375	doc="""A pair of newline-terminated strings.
				376
				377	These are str-style strings, without embedded
				378	escapes, or bracketing quotes. They should
				379	consist solely of printable ASCII characters.
				380	The pair is returned as a single string, with
Tim Peters	d916cf4	2003-01-27 19:01:47 +0000	[diff] [blame]	381	a single blank separating the two strings.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	382	""")
				383
				384	def read_string4(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	385	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	386	>>> import io
				387	>>> read_string4(io.BytesIO(b"\x00\x00\x00\x00abc"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	388	''
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	389	>>> read_string4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	390	'abc'
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	391	>>> read_string4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	392	Traceback (most recent call last):
				393	...
				394	ValueError: expected 50331648 bytes in a string4, but only 6 remain
				395	"""
				396
				397	n = read_int4(f)
				398	if n < 0:
				399	raise ValueError("string4 byte count < 0: %d" % n)
				400	data = f.read(n)
				401	if len(data) == n:
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	402	return data.decode("latin-1")
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	403	raise ValueError("expected %d bytes in a string4, but only %d remain" %
				404	(n, len(data)))
				405
				406	string4 = ArgumentDescriptor(
				407	name="string4",
Tim Peters	fdb8cfa	2003-01-28 00:13:19 +0000	[diff] [blame]	408	n=TAKEN_FROM_ARGUMENT4,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	409	reader=read_string4,
				410	doc="""A counted string.
				411
				412	The first argument is a 4-byte little-endian signed int giving
				413	the number of bytes in the string, and the second argument is
				414	that many bytes.
				415	""")
				416
				417
				418	def read_string1(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	419	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	420	>>> import io
				421	>>> read_string1(io.BytesIO(b"\x00"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	422	''
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	423	>>> read_string1(io.BytesIO(b"\x03abcdef"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	424	'abc'
				425	"""
				426
				427	n = read_uint1(f)
				428	assert n >= 0
				429	data = f.read(n)
				430	if len(data) == n:
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	431	return data.decode("latin-1")
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	432	raise ValueError("expected %d bytes in a string1, but only %d remain" %
				433	(n, len(data)))
				434
				435	string1 = ArgumentDescriptor(
				436	name="string1",
Tim Peters	fdb8cfa	2003-01-28 00:13:19 +0000	[diff] [blame]	437	n=TAKEN_FROM_ARGUMENT1,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	438	reader=read_string1,
				439	doc="""A counted string.
				440
				441	The first argument is a 1-byte unsigned int giving the number
				442	of bytes in the string, and the second argument is that many
				443	bytes.
				444	""")
				445
				446
def read_bytes1(f):
    r"""Read a bytes string prefixed by a 1-byte unsigned byte count.

    The payload is returned unchanged as a bytes object.  Raises
    ValueError if the stream holds fewer payload bytes than the count
    promises.

    >>> import io
    >>> read_bytes1(io.BytesIO(b"\x00"))
    b''
    >>> read_bytes1(io.BytesIO(b"\x03abcdef"))
    b'abc'
    """

    size = read_uint1(f)
    assert size >= 0
    payload = f.read(size)
    if len(payload) != size:
        raise ValueError("expected %d bytes in a bytes1, but only %d remain" %
                         (size, len(payload)))
    return payload
				463
				464	bytes1 = ArgumentDescriptor(
				465	name="bytes1",
				466	n=TAKEN_FROM_ARGUMENT1,
				467	reader=read_bytes1,
				468	doc="""A counted bytes string.
				469
				470	The first argument is a 1-byte unsigned int giving the number
				471	of bytes, and the second argument is that many bytes.
				472	""")
				473
				474
				475	def read_bytes4(f):
				476	r"""
				477	>>> import io
				478	>>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x00abc"))
				479	b''
				480	>>> read_bytes4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
				481	b'abc'
				482	>>> read_bytes4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
				483	Traceback (most recent call last):
				484	...
				485	ValueError: expected 50331648 bytes in a bytes4, but only 6 remain
				486	"""
				487
				488	n = read_uint4(f)
				489	if n > sys.maxsize:
				490	raise ValueError("bytes4 byte count > sys.maxsize: %d" % n)
				491	data = f.read(n)
				492	if len(data) == n:
				493	return data
				494	raise ValueError("expected %d bytes in a bytes4, but only %d remain" %
				495	(n, len(data)))
				496
				497	bytes4 = ArgumentDescriptor(
				498	name="bytes4",
				499	n=TAKEN_FROM_ARGUMENT4U,
				500	reader=read_bytes4,
				501	doc="""A counted bytes string.
				502
				503	The first argument is a 4-byte little-endian unsigned int giving
				504	the number of bytes, and the second argument is that many bytes.
				505	""")
				506
				507
def read_unicodestringnl(f):
    r"""Read a newline-terminated, raw-unicode-escape encoded string from f.

    One line is consumed with readline(); the trailing newline is dropped
    and the rest is decoded with the 'raw-unicode-escape' codec.  Raises
    ValueError if the line has no terminating newline.

    >>> import io
    >>> read_unicodestringnl(io.BytesIO(b"abc\\uabcd\njunk")) == 'abc\uabcd'
    True
    """

    line = f.readline()
    if line.endswith(b'\n'):
        encoded = line[:-1]     # lose the newline
        return str(encoded, 'raw-unicode-escape')
    raise ValueError("no newline found when trying to read "
                     "unicodestringnl")
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	521
				522	unicodestringnl = ArgumentDescriptor(
				523	name='unicodestringnl',
				524	n=UP_TO_NEWLINE,
				525	reader=read_unicodestringnl,
				526	doc="""A newline-terminated Unicode string.
				527
				528	This is raw-unicode-escape encoded, so consists of
				529	printable ASCII characters, and may contain embedded
				530	escape sequences.
				531	""")
				532
				533	def read_unicodestring4(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	534	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	535	>>> import io
				536	>>> s = 'abcd\uabcd'
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	537	>>> enc = s.encode('utf-8')
				538	>>> enc
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	539	b'abcd\xea\xaf\x8d'
				540	>>> n = bytes([len(enc), 0, 0, 0]) # little-endian 4-byte length
				541	>>> t = read_unicodestring4(io.BytesIO(n + enc + b'junk'))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	542	>>> s == t
				543	True
				544
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	545	>>> read_unicodestring4(io.BytesIO(n + enc[:-1]))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	546	Traceback (most recent call last):
				547	...
				548	ValueError: expected 7 bytes in a unicodestring4, but only 6 remain
				549	"""
				550
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	551	n = read_uint4(f)
				552	if n > sys.maxsize:
				553	raise ValueError("unicodestring4 byte count > sys.maxsize: %d" % n)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	554	data = f.read(n)
				555	if len(data) == n:
Victor Stinner	485fb56	2010-04-13 11:07:24 +0000	[diff] [blame]	556	return str(data, 'utf-8', 'surrogatepass')
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	557	raise ValueError("expected %d bytes in a unicodestring4, but only %d "
				558	"remain" % (n, len(data)))
				559
				560	unicodestring4 = ArgumentDescriptor(
				561	name="unicodestring4",
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	562	n=TAKEN_FROM_ARGUMENT4U,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	563	reader=read_unicodestring4,
				564	doc="""A counted Unicode string.
				565
				566	The first argument is a 4-byte little-endian signed int
				567	giving the number of bytes in the string, and the second
				568	argument-- the UTF-8 encoding of the Unicode string --
				569	contains that many bytes.
				570	""")
				571
				572
				573	def read_decimalnl_short(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	574	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	575	>>> import io
				576	>>> read_decimalnl_short(io.BytesIO(b"1234\n56"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	577	1234
				578
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	579	>>> read_decimalnl_short(io.BytesIO(b"1234L\n56"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	580	Traceback (most recent call last):
				581	...
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	582	ValueError: trailing 'L' not allowed in b'1234L'
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	583	"""
				584
				585	s = read_stringnl(f, decode=False, stripquotes=False)
Guido van Rossum	26d95c3	2007-08-27 23:18:54 +0000	[diff] [blame]	586	if s.endswith(b"L"):
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	587	raise ValueError("trailing 'L' not allowed in %r" % s)
				588
				589	# It's not necessarily true that the result fits in a Python short int:
				590	# the pickle may have been written on a 64-bit box. There's also a hack
				591	# for True and False here.
Jeremy Hylton	a5dc3db	2007-08-29 19:07:40 +0000	[diff] [blame]	592	if s == b"00":
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	593	return False
Jeremy Hylton	a5dc3db	2007-08-29 19:07:40 +0000	[diff] [blame]	594	elif s == b"01":
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	595	return True
				596
Florent Xicluna	2bb96f5	2011-10-23 22:11:00 +0200	[diff] [blame]	597	return int(s)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	598
				599	def read_decimalnl_long(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	600	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	601	>>> import io
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	602
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	603	>>> read_decimalnl_long(io.BytesIO(b"1234L\n56"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	604	1234
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	605
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	606	>>> read_decimalnl_long(io.BytesIO(b"123456789012345678901234L\n6"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	607	123456789012345678901234
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	608	"""
				609
				610	s = read_stringnl(f, decode=False, stripquotes=False)
Mark Dickinson	8dd0514	2009-01-20 20:43:58 +0000	[diff] [blame]	611	if s[-1:] == b'L':
				612	s = s[:-1]
Guido van Rossum	e2a383d	2007-01-15 16:59:06 +0000	[diff] [blame]	613	return int(s)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	614
				615
# Argument descriptor for a newline-terminated decimal integer that carries
# no trailing 'L'.  The argument bytes are decoded by read_decimalnl_short;
# the length is not fixed (n=UP_TO_NEWLINE).
decimalnl_short = ArgumentDescriptor(
    name='decimalnl_short',
    n=UP_TO_NEWLINE,
    reader=read_decimalnl_short,
    doc="""A newline-terminated decimal integer literal.

This never has a trailing 'L', and the integer fit
in a short Python int on the box where the pickle
was written -- but there's no guarantee it will fit
in a short Python int on the box where the pickle
is read.
""")
				628
# Argument descriptor for a newline-terminated decimal integer written with
# a trailing 'L'.  The argument bytes are decoded by read_decimalnl_long,
# which removes the 'L' only when it is actually present.
decimalnl_long = ArgumentDescriptor(
    name='decimalnl_long',
    n=UP_TO_NEWLINE,
    reader=read_decimalnl_long,
    doc="""A newline-terminated decimal integer literal.

This has a trailing 'L', and can represent integers
of any size.
""")
				638
				639
				640	def read_floatnl(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	641	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	642	>>> import io
				643	>>> read_floatnl(io.BytesIO(b"-1.25\n6"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	644	-1.25
				645	"""
				646	s = read_stringnl(f, decode=False, stripquotes=False)
				647	return float(s)
				648
# Argument descriptor for a newline-terminated textual float.  The argument
# bytes are decoded by read_floatnl; the length is not fixed
# (n=UP_TO_NEWLINE).
floatnl = ArgumentDescriptor(
    name='floatnl',
    n=UP_TO_NEWLINE,
    reader=read_floatnl,
    doc="""A newline-terminated decimal floating literal.

In general this requires 17 significant digits for roundtrip
identity, and pickling then unpickling infinities, NaNs, and
minus zero doesn't work across boxes, or on some boxes even
on itself (e.g., Windows can't read the strings it produces
for infinities or NaNs).
""")
				661
				662	def read_float8(f):
Tim Peters	55762f5	2003-01-28 16:01:25 +0000	[diff] [blame]	663	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	664	>>> import io, struct
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	665	>>> raw = struct.pack(">d", -1.25)
				666	>>> raw
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	667	b'\xbf\xf4\x00\x00\x00\x00\x00\x00'
				668	>>> read_float8(io.BytesIO(raw + b"\n"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	669	-1.25
				670	"""
				671
				672	data = f.read(8)
				673	if len(data) == 8:
				674	return _unpack(">d", data)[0]
				675	raise ValueError("not enough data in stream to read float8")
				676
				677
# Argument descriptor for a fixed-width (n=8) binary float.  The argument
# bytes are decoded by read_float8, which unpacks them with the struct
# format '>d'.
float8 = ArgumentDescriptor(
    name='float8',
    n=8,
    reader=read_float8,
    doc="""An 8-byte binary representation of a float, big-endian.

The format is unique to Python, and shared with the struct
module (format string '>d') "in theory" (the struct and pickle
implementations don't share the code -- they should). It's
strongly related to the IEEE-754 double format, and, in normal
cases, is in fact identical to the big-endian 754 double format.
On other boxes the dynamic range is limited to that of a 754
double, and "add a half and chop" rounding is used to reduce
the precision to 53 bits. However, even on a 754 box,
infinities, NaNs, and minus zero may not be handled correctly
(may not survive roundtrip pickling intact).
""")
				695
Guido van Rossum	5a2d8f5	2003-01-27 21:44:25 +0000	[diff] [blame]	696	# Protocol 2 formats
				697
Tim Peters	c0c12b5	2003-01-29 00:56:17 +0000	[diff] [blame]	698	from pickle import decode_long
Guido van Rossum	5a2d8f5	2003-01-27 21:44:25 +0000	[diff] [blame]	699
				700	def read_long1(f):
				701	r"""
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	702	>>> import io
				703	>>> read_long1(io.BytesIO(b"\x00"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	704	0
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	705	>>> read_long1(io.BytesIO(b"\x02\xff\x00"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	706	255
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	707	>>> read_long1(io.BytesIO(b"\x02\xff\x7f"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	708	32767
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	709	>>> read_long1(io.BytesIO(b"\x02\x00\xff"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	710	-256
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	711	>>> read_long1(io.BytesIO(b"\x02\x00\x80"))
Guido van Rossum	e2b70bc	2006-08-18 22:13:04 +0000	[diff] [blame]	712	-32768
Guido van Rossum	5a2d8f5	2003-01-27 21:44:25 +0000	[diff] [blame]	713	"""
				714
				715	n = read_uint1(f)
				716	data = f.read(n)
				717	if len(data) != n:
				718	raise ValueError("not enough data in stream to read long1")
				719	return decode_long(data)
				720
				721	long1 = ArgumentDescriptor(
				722	name="long1",
Tim Peters	fdb8cfa	2003-01-28 00:13:19 +0000	[diff] [blame]	723	n=TAKEN_FROM_ARGUMENT1,
Guido van Rossum	5a2d8f5	2003-01-27 21:44:25 +0000	[diff] [blame]	724	reader=read_long1,
				725	doc="""A binary long, little-endian, using 1-byte size.
				726
				727	This first reads one byte as an unsigned size, then reads that
Tim Peters	bdbe741	2003-01-27 23:54:04 +0000	[diff] [blame]	728	many bytes and interprets them as a little-endian 2's-complement long.
Tim Peters	4b23f2b	2003-01-31 16:43:39 +0000	[diff] [blame]	729	If the size is 0, that's taken as a shortcut for the long 0L.
Guido van Rossum	5a2d8f5	2003-01-27 21:44:25 +0000	[diff] [blame]	730	""")
				731
def read_long4(f):
    r"""Read a long4 argument: a 4-byte size followed by that many bytes.

    The size is read with read_int4() and must be non-negative; the
    payload is handed to decode_long().  A size of 0 yields 0.

    Raises ValueError if the size is negative, or if the stream ends
    before the whole payload has been read.

    >>> import io
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x00"))
    255
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x7f"))
    32767
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\xff"))
    -256
    >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\x80"))
    -32768
    >>> read_long4(io.BytesIO(b"\x00\x00\x00\x00"))
    0
    """

    n = read_int4(f)
    if n < 0:
        # The size field is signed on the wire, so reject negative counts
        # explicitly instead of passing them on to f.read().
        raise ValueError("long4 byte count < 0: %d" % n)
    data = f.read(n)
    if len(data) != n:
        raise ValueError("not enough data in stream to read long4")
    return decode_long(data)
				754
# Argument descriptor for a variable-length binary integer whose payload
# size is given by a leading 4-byte count (n=TAKEN_FROM_ARGUMENT4).  The
# argument bytes are decoded by read_long4, which rejects negative sizes.
long4 = ArgumentDescriptor(
    name="long4",
    n=TAKEN_FROM_ARGUMENT4,
    reader=read_long4,
    doc="""A binary representation of a long, little-endian.

This first reads four bytes as a signed size (but requires the
size to be >= 0), then reads that many bytes and interprets them
as a little-endian 2's-complement long. If the size is 0, that's taken
as a shortcut for the int 0, although LONG1 should really be used
then instead (and in any case where # of bytes < 256).
""")
				767
				768
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	769	##############################################################################
				770	# Object descriptors. The stack used by the pickle machine holds objects,
				771	# and in the stack_before and stack_after attributes of OpcodeInfo
				772	# descriptors we need names to describe the various types of objects that can
				773	# appear on the stack.
				774
				775	class StackObject(object):
				776	__slots__ = (
				777	# name of descriptor record, for info only
				778	'name',
				779
				780	# type of object, or tuple of type objects (meaning the object can
				781	# be of any type in the tuple)
				782	'obtype',
				783
				784	# human-readable docs for this kind of stack object; a string
				785	'doc',
				786	)
				787
				788	def __init__(self, name, obtype, doc):
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	789	assert isinstance(name, str)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	790	self.name = name
				791
				792	assert isinstance(obtype, type) or isinstance(obtype, tuple)
				793	if isinstance(obtype, tuple):
				794	for contained in obtype:
				795	assert isinstance(contained, type)
				796	self.obtype = obtype
				797
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	798	assert isinstance(doc, str)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	799	self.doc = doc
				800
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	801	def __repr__(self):
				802	return self.name
				803
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	804
# Stack-object descriptors for the basic Python value kinds.  Each record
# only names and documents a kind of object; see StackObject for the
# meaning of the three fields.

# pyint and pylong share obtype=int; only the name and doc text differ.
pyint = StackObject(
    name='int',
    obtype=int,
    doc="A short (as opposed to long) Python integer object.")

pylong = StackObject(
    name='long',
    obtype=int,
    doc="A long (as opposed to short) Python integer object.")

pyinteger_or_bool = StackObject(
    name='int_or_bool',
    obtype=(int, bool),
    doc="A Python integer object (short or long), or "
        "a Python bool.")

# Note: obtype is a one-element tuple here, not the bare bool type.
pybool = StackObject(
    name='bool',
    obtype=(bool,),
    doc="A Python bool object.")

pyfloat = StackObject(
    name='float',
    obtype=float,
    doc="A Python float object.")

# pystring and pybytes both use obtype=bytes; they differ in name and doc.
pystring = StackObject(
    name='string',
    obtype=bytes,
    doc="A Python (8-bit) string object.")

pybytes = StackObject(
    name='bytes',
    obtype=bytes,
    doc="A Python bytes object.")

# The descriptor variable is called pyunicode but its record name is 'str'.
pyunicode = StackObject(
    name='str',
    obtype=str,
    doc="A Python (Unicode) string object.")

pynone = StackObject(
    name="None",
    obtype=type(None),
    doc="The Python None object.")

pytuple = StackObject(
    name="tuple",
    obtype=tuple,
    doc="A Python tuple object.")

pylist = StackObject(
    name="list",
    obtype=list,
    doc="A Python list object.")

pydict = StackObject(
    name="dict",
    obtype=dict,
    doc="A Python dict object.")

# obtype=object, so this descriptor matches any object at all.
anyobject = StackObject(
    name='any',
    obtype=object,
    doc="Any kind of object whatsoever.")
				870
# Descriptor for the marker pushed by the MARK opcode.  Its obtype is
# StackObject itself rather than an ordinary Python value type.
markobject = StackObject(
    name="mark",
    obtype=StackObject,
    doc="""'The mark' is a unique object.

Opcodes that operate on a variable number of objects
generally don't embed the count of objects in the opcode,
or pull it off the stack. Instead the MARK opcode is used
to push a special marker object on the stack, and then
some other opcodes grab all the objects from the top of
the stack down to (but not including) the topmost marker
object.
""")
				884
				885	stackslice = StackObject(
				886	name="stackslice",
				887	obtype=StackObject,
				888	doc="""An object representing a contiguous slice of the stack.
				889
				890	This is used in conjuction with markobject, to represent all
				891	of the stack following the topmost markobject. For example,
				892	the POP_MARK opcode changes the stack from
				893
				894	[..., markobject, stackslice]
				895	to
				896	[...]
				897
				898	No matter how many object are on the stack after the topmost
				899	markobject, POP_MARK gets rid of all of them (including the
				900	topmost markobject too).
				901	""")
				902
				903	##############################################################################
				904	# Descriptors for pickle opcodes.
				905
				906	class OpcodeInfo(object):
				907
				908	__slots__ = (
				909	# symbolic name of opcode; a string
				910	'name',
				911
				912	# the code used in a bytestream to represent the opcode; a
				913	# one-character string
				914	'code',
				915
				916	# If the opcode has an argument embedded in the byte string, an
				917	# instance of ArgumentDescriptor specifying its type. Note that
				918	# arg.reader(s) can be used to read and decode the argument from
				919	# the bytestream s, and arg.doc documents the format of the raw
				920	# argument bytes. If the opcode doesn't have an argument embedded
				921	# in the bytestream, arg should be None.
				922	'arg',
				923
				924	# what the stack looks like before this opcode runs; a list
				925	'stack_before',
				926
				927	# what the stack looks like after this opcode runs; a list
				928	'stack_after',
				929
				930	# the protocol number in which this opcode was introduced; an int
				931	'proto',
				932
				933	# human-readable docs for this opcode; a string
				934	'doc',
				935	)
				936
				937	def __init__(self, name, code, arg,
				938	stack_before, stack_after, proto, doc):
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	939	assert isinstance(name, str)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	940	self.name = name
				941
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	942	assert isinstance(code, str)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	943	assert len(code) == 1
				944	self.code = code
				945
				946	assert arg is None or isinstance(arg, ArgumentDescriptor)
				947	self.arg = arg
				948
				949	assert isinstance(stack_before, list)
				950	for x in stack_before:
				951	assert isinstance(x, StackObject)
				952	self.stack_before = stack_before
				953
				954	assert isinstance(stack_after, list)
				955	for x in stack_after:
				956	assert isinstance(x, StackObject)
				957	self.stack_after = stack_after
				958
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	959	assert isinstance(proto, int) and 0 <= proto <= pickle.HIGHEST_PROTOCOL
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	960	self.proto = proto
				961
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	962	assert isinstance(doc, str)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	963	self.doc = doc
				964
				965	I = OpcodeInfo
				966	opcodes = [
				967
				968	# Ways to spell integers.
				969
				970	I(name='INT',
				971	code='I',
				972	arg=decimalnl_short,
				973	stack_before=[],
				974	stack_after=[pyinteger_or_bool],
				975	proto=0,
				976	doc="""Push an integer or bool.
				977
				978	The argument is a newline-terminated decimal literal string.
				979
				980	The intent may have been that this always fit in a short Python int,
				981	but INT can be generated in pickles written on a 64-bit box that
				982	require a Python long on a 32-bit box. The difference between this
				983	and LONG then is that INT skips a trailing 'L', and produces a short
				984	int whenever possible.
				985
				986	Another difference is due to that, when bool was introduced as a
				987	distinct type in 2.3, builtin names True and False were also added to
				988	2.2.2, mapping to ints 1 and 0. For compatibility in both directions,
				989	True gets pickled as INT + "I01\\n", and False as INT + "I00\\n".
				990	Leading zeroes are never produced for a genuine integer. The 2.3
				991	(and later) unpicklers special-case these and return bool instead;
				992	earlier unpicklers ignore the leading "0" and return the int.
				993	"""),
				994
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	995	I(name='BININT',
				996	code='J',
				997	arg=int4,
				998	stack_before=[],
				999	stack_after=[pyint],
				1000	proto=1,
				1001	doc="""Push a four-byte signed integer.
				1002
				1003	This handles the full range of Python (short) integers on a 32-bit
				1004	box, directly as binary bytes (1 for the opcode and 4 for the integer).
				1005	If the integer is non-negative and fits in 1 or 2 bytes, pickling via
				1006	BININT1 or BININT2 saves space.
				1007	"""),
				1008
				1009	I(name='BININT1',
				1010	code='K',
				1011	arg=uint1,
				1012	stack_before=[],
				1013	stack_after=[pyint],
				1014	proto=1,
				1015	doc="""Push a one-byte unsigned integer.
				1016
				1017	This is a space optimization for pickling very small non-negative ints,
				1018	in range(256).
				1019	"""),
				1020
				1021	I(name='BININT2',
				1022	code='M',
				1023	arg=uint2,
				1024	stack_before=[],
				1025	stack_after=[pyint],
				1026	proto=1,
				1027	doc="""Push a two-byte unsigned integer.
				1028
				1029	This is a space optimization for pickling small positive ints, in
				1030	range(256, 2**16). Integers in range(256) can also be pickled via
				1031	BININT2, but BININT1 instead saves a byte.
				1032	"""),
				1033
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1034	I(name='LONG',
				1035	code='L',
				1036	arg=decimalnl_long,
				1037	stack_before=[],
				1038	stack_after=[pylong],
				1039	proto=0,
				1040	doc="""Push a long integer.
				1041
				1042	The same as INT, except that the literal ends with 'L', and always
				1043	unpickles to a Python long. There doesn't seem a real purpose to the
				1044	trailing 'L'.
				1045
				1046	Note that LONG takes time quadratic in the number of digits when
				1047	unpickling (this is simply due to the nature of decimal->binary
				1048	conversion). Proto 2 added linear-time (in C; still quadratic-time
				1049	in Python) LONG1 and LONG4 opcodes.
				1050	"""),
				1051
				1052	I(name="LONG1",
				1053	code='\x8a',
				1054	arg=long1,
				1055	stack_before=[],
				1056	stack_after=[pylong],
				1057	proto=2,
				1058	doc="""Long integer using one-byte length.
				1059
				1060	A more efficient encoding of a Python long; the long1 encoding
				1061	says it all."""),
				1062
				1063	I(name="LONG4",
				1064	code='\x8b',
				1065	arg=long4,
				1066	stack_before=[],
				1067	stack_after=[pylong],
				1068	proto=2,
				1069	doc="""Long integer using found-byte length.
				1070
				1071	A more efficient encoding of a Python long; the long4 encoding
				1072	says it all."""),
				1073
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1074	# Ways to spell strings (8-bit, not Unicode).
				1075
				1076	I(name='STRING',
				1077	code='S',
				1078	arg=stringnl,
				1079	stack_before=[],
				1080	stack_after=[pystring],
				1081	proto=0,
				1082	doc="""Push a Python string object.
				1083
				1084	The argument is a repr-style string, with bracketing quote characters,
				1085	and perhaps embedded escapes. The argument extends until the next
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1086	newline character. (Actually, they are decoded into a str instance
				1087	using the encoding given to the Unpickler constructor. or the default,
				1088	'ASCII'.)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1089	"""),
				1090
				1091	I(name='BINSTRING',
				1092	code='T',
				1093	arg=string4,
				1094	stack_before=[],
				1095	stack_after=[pystring],
				1096	proto=1,
				1097	doc="""Push a Python string object.
				1098
				1099	There are two arguments: the first is a 4-byte little-endian signed int
				1100	giving the number of bytes in the string, and the second is that many
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1101	bytes, which are taken literally as the string content. (Actually,
				1102	they are decoded into a str instance using the encoding given to the
				1103	Unpickler constructor. or the default, 'ASCII'.)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1104	"""),
				1105
				1106	I(name='SHORT_BINSTRING',
				1107	code='U',
				1108	arg=string1,
				1109	stack_before=[],
				1110	stack_after=[pystring],
				1111	proto=1,
				1112	doc="""Push a Python string object.
				1113
				1114	There are two arguments: the first is a 1-byte unsigned int giving
				1115	the number of bytes in the string, and the second is that many bytes,
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1116	which are taken literally as the string content. (Actually, they
				1117	are decoded into a str instance using the encoding given to the
				1118	Unpickler constructor. or the default, 'ASCII'.)
				1119	"""),
				1120
				1121	# Bytes (protocol 3 only; older protocols don't support bytes at all)
				1122
				1123	I(name='BINBYTES',
				1124	code='B',
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1125	arg=bytes4,
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1126	stack_before=[],
				1127	stack_after=[pybytes],
				1128	proto=3,
				1129	doc="""Push a Python bytes object.
				1130
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1131	There are two arguments: the first is a 4-byte little-endian unsigned int
				1132	giving the number of bytes, and the second is that many bytes, which are
				1133	taken literally as the bytes content.
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1134	"""),
				1135
				1136	I(name='SHORT_BINBYTES',
				1137	code='C',
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1138	arg=bytes1,
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1139	stack_before=[],
				1140	stack_after=[pybytes],
Collin Winter	e61d437	2009-05-20 17:46:47 +0000	[diff] [blame]	1141	proto=3,
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1142	doc="""Push a Python bytes object.
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	1143
				1144	There are two arguments: the first is a 1-byte unsigned int giving
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1145	the number of bytes, and the second is that many bytes, which are taken
				1146	literally as the string content.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1147	"""),
				1148
				1149	# Ways to spell None.
				1150
				1151	I(name='NONE',
				1152	code='N',
				1153	arg=None,
				1154	stack_before=[],
				1155	stack_after=[pynone],
				1156	proto=0,
				1157	doc="Push None on the stack."),
				1158
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1159	# Ways to spell bools, starting with proto 2. See INT for how this was
				1160	# done before proto 2.
				1161
				1162	I(name='NEWTRUE',
				1163	code='\x88',
				1164	arg=None,
				1165	stack_before=[],
				1166	stack_after=[pybool],
				1167	proto=2,
				1168	doc="""True.
				1169
				1170	Push True onto the stack."""),
				1171
				1172	I(name='NEWFALSE',
				1173	code='\x89',
				1174	arg=None,
				1175	stack_before=[],
				1176	stack_after=[pybool],
				1177	proto=2,
				1178	doc="""True.
				1179
				1180	Push False onto the stack."""),
				1181
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1182	# Ways to spell Unicode strings.
				1183
				1184	I(name='UNICODE',
				1185	code='V',
				1186	arg=unicodestringnl,
				1187	stack_before=[],
				1188	stack_after=[pyunicode],
				1189	proto=0, # this may be pure-text, but it's a later addition
				1190	doc="""Push a Python Unicode string object.
				1191
				1192	The argument is a raw-unicode-escape encoding of a Unicode string,
				1193	and so may contain embedded escape sequences. The argument extends
				1194	until the next newline character.
				1195	"""),
				1196
				1197	I(name='BINUNICODE',
				1198	code='X',
				1199	arg=unicodestring4,
				1200	stack_before=[],
				1201	stack_after=[pyunicode],
				1202	proto=1,
				1203	doc="""Push a Python Unicode string object.
				1204
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1205	There are two arguments: the first is a 4-byte little-endian unsigned int
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1206	giving the number of bytes in the string. The second is that many
				1207	bytes, and is the UTF-8 encoding of the Unicode string.
				1208	"""),
				1209
				1210	# Ways to spell floats.
				1211
				1212	I(name='FLOAT',
				1213	code='F',
				1214	arg=floatnl,
				1215	stack_before=[],
				1216	stack_after=[pyfloat],
				1217	proto=0,
				1218	doc="""Newline-terminated decimal float literal.
				1219
				1220	The argument is repr(a_float), and in general requires 17 significant
				1221	digits for roundtrip conversion to be an identity (this is so for
				1222	IEEE-754 double precision values, which is what Python float maps to
				1223	on most boxes).
				1224
				1225	In general, FLOAT cannot be used to transport infinities, NaNs, or
				1226	minus zero across boxes (or even on a single box, if the platform C
				1227	library can't read the strings it produces for such things -- Windows
				1228	is like that), but may do less damage than BINFLOAT on boxes with
				1229	greater precision or dynamic range than IEEE-754 double.
				1230	"""),
				1231
				1232	I(name='BINFLOAT',
				1233	code='G',
				1234	arg=float8,
				1235	stack_before=[],
				1236	stack_after=[pyfloat],
				1237	proto=1,
				1238	doc="""Float stored in binary form, with 8 bytes of data.
				1239
				1240	This generally requires less than half the space of FLOAT encoding.
				1241	In general, BINFLOAT cannot be used to transport infinities, NaNs, or
				1242	minus zero, raises an exception if the exponent exceeds the range of
				1243	an IEEE-754 double, and retains no more than 53 bits of precision (if
				1244	there are more than that, "add a half and chop" rounding is used to
				1245	cut it back to 53 significant bits).
				1246	"""),
				1247
				1248	# Ways to build lists.
				1249
				1250	I(name='EMPTY_LIST',
				1251	code=']',
				1252	arg=None,
				1253	stack_before=[],
				1254	stack_after=[pylist],
				1255	proto=1,
				1256	doc="Push an empty list."),
				1257
				1258	I(name='APPEND',
				1259	code='a',
				1260	arg=None,
				1261	stack_before=[pylist, anyobject],
				1262	stack_after=[pylist],
				1263	proto=0,
				1264	doc="""Append an object to a list.
				1265
				1266	Stack before: ... pylist anyobject
				1267	Stack after: ... pylist+[anyobject]
Tim Peters	81098ac	2003-01-28 05:12:08 +0000	[diff] [blame]	1268
				1269	although pylist is really extended in-place.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1270	"""),
				1271
				1272	I(name='APPENDS',
				1273	code='e',
				1274	arg=None,
				1275	stack_before=[pylist, markobject, stackslice],
				1276	stack_after=[pylist],
				1277	proto=1,
				1278	doc="""Extend a list by a slice of stack objects.
				1279
				1280	Stack before: ... pylist markobject stackslice
				1281	Stack after: ... pylist+stackslice
Tim Peters	81098ac	2003-01-28 05:12:08 +0000	[diff] [blame]	1282
				1283	although pylist is really extended in-place.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1284	"""),
				1285
				1286	I(name='LIST',
				1287	code='l',
				1288	arg=None,
				1289	stack_before=[markobject, stackslice],
				1290	stack_after=[pylist],
				1291	proto=0,
				1292	doc="""Build a list out of the topmost stack slice, after markobject.
				1293
				1294	All the stack entries following the topmost markobject are placed into
				1295	a single Python list, which single list object replaces all of the
				1296	stack from the topmost markobject onward. For example,
				1297
				1298	Stack before: ... markobject 1 2 3 'abc'
				1299	Stack after: ... [1, 2, 3, 'abc']
				1300	"""),
				1301
				1302	# Ways to build tuples.
				1303
				1304	I(name='EMPTY_TUPLE',
				1305	code=')',
				1306	arg=None,
				1307	stack_before=[],
				1308	stack_after=[pytuple],
				1309	proto=1,
				1310	doc="Push an empty tuple."),
				1311
				1312	I(name='TUPLE',
				1313	code='t',
				1314	arg=None,
				1315	stack_before=[markobject, stackslice],
				1316	stack_after=[pytuple],
				1317	proto=0,
				1318	doc="""Build a tuple out of the topmost stack slice, after markobject.
				1319
				1320	All the stack entries following the topmost markobject are placed into
				1321	a single Python tuple, which single tuple object replaces all of the
				1322	stack from the topmost markobject onward. For example,
				1323
				1324	Stack before: ... markobject 1 2 3 'abc'
				1325	Stack after: ... (1, 2, 3, 'abc')
				1326	"""),
				1327
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1328	I(name='TUPLE1',
				1329	code='\x85',
				1330	arg=None,
				1331	stack_before=[anyobject],
				1332	stack_after=[pytuple],
				1333	proto=2,
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1334	doc="""Build a one-tuple out of the topmost item on the stack.
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1335
				1336	This code pops one value off the stack and pushes a tuple of
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1337	length 1 whose one item is that value back onto it. In other
				1338	words:
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1339
				1340	stack[-1] = tuple(stack[-1:])
				1341	"""),
				1342
				1343	I(name='TUPLE2',
				1344	code='\x86',
				1345	arg=None,
				1346	stack_before=[anyobject, anyobject],
				1347	stack_after=[pytuple],
				1348	proto=2,
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1349	doc="""Build a two-tuple out of the top two items on the stack.
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1350
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1351	This code pops two values off the stack and pushes a tuple of
				1352	length 2 whose items are those values back onto it. In other
				1353	words:
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1354
				1355	stack[-2:] = [tuple(stack[-2:])]
				1356	"""),
				1357
				1358	I(name='TUPLE3',
				1359	code='\x87',
				1360	arg=None,
				1361	stack_before=[anyobject, anyobject, anyobject],
				1362	stack_after=[pytuple],
				1363	proto=2,
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1364	doc="""Build a three-tuple out of the top three items on the stack.
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1365
Alexander Belopolsky	44c2ffd	2010-07-16 14:39:45 +0000	[diff] [blame]	1366	This code pops three values off the stack and pushes a tuple of
				1367	length 3 whose items are those values back onto it. In other
				1368	words:
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1369
				1370	stack[-3:] = [tuple(stack[-3:])]
				1371	"""),
				1372
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1373	# Ways to build dicts.
				1374
				1375	I(name='EMPTY_DICT',
				1376	code='}',
				1377	arg=None,
				1378	stack_before=[],
				1379	stack_after=[pydict],
				1380	proto=1,
				1381	doc="Push an empty dict."),
				1382
				1383	I(name='DICT',
				1384	code='d',
				1385	arg=None,
				1386	stack_before=[markobject, stackslice],
				1387	stack_after=[pydict],
				1388	proto=0,
				1389	doc="""Build a dict out of the topmost stack slice, after markobject.
				1390
				1391	All the stack entries following the topmost markobject are placed into
				1392	a single Python dict, which single dict object replaces all of the
				1393	stack from the topmost markobject onward. The stack slice alternates
				1394	key, value, key, value, .... For example,
				1395
				1396	Stack before: ... markobject 1 2 3 'abc'
				1397	Stack after: ... {1: 2, 3: 'abc'}
				1398	"""),
				1399
				1400	I(name='SETITEM',
				1401	code='s',
				1402	arg=None,
				1403	stack_before=[pydict, anyobject, anyobject],
				1404	stack_after=[pydict],
				1405	proto=0,
				1406	doc="""Add a key+value pair to an existing dict.
				1407
				1408	Stack before: ... pydict key value
				1409	Stack after: ... pydict
				1410
				1411	where pydict has been modified via pydict[key] = value.
				1412	"""),
				1413
				1414	I(name='SETITEMS',
				1415	code='u',
				1416	arg=None,
				1417	stack_before=[pydict, markobject, stackslice],
				1418	stack_after=[pydict],
				1419	proto=1,
				1420	doc="""Add an arbitrary number of key+value pairs to an existing dict.
				1421
				1422	The slice of the stack following the topmost markobject is taken as
				1423	an alternating sequence of keys and values, added to the dict
				1424	immediately under the topmost markobject. Everything at and after the
				1425	topmost markobject is popped, leaving the mutated dict at the top
				1426	of the stack.
				1427
				1428	Stack before: ... pydict markobject key_1 value_1 ... key_n value_n
				1429	Stack after: ... pydict
				1430
				1431	where pydict has been modified via pydict[key_i] = value_i for i in
				1432	1, 2, ..., n, and in that order.
				1433	"""),
				1434
				1435	# Stack manipulation.
				1436
				1437	I(name='POP',
				1438	code='0',
				1439	arg=None,
				1440	stack_before=[anyobject],
				1441	stack_after=[],
				1442	proto=0,
				1443	doc="Discard the top stack item, shrinking the stack by one item."),
				1444
				1445	I(name='DUP',
				1446	code='2',
				1447	arg=None,
				1448	stack_before=[anyobject],
				1449	stack_after=[anyobject, anyobject],
				1450	proto=0,
				1451	doc="Push the top stack item onto the stack again, duplicating it."),
				1452
				1453	I(name='MARK',
				1454	code='(',
				1455	arg=None,
				1456	stack_before=[],
				1457	stack_after=[markobject],
				1458	proto=0,
				1459	doc="""Push markobject onto the stack.
				1460
				1461	markobject is a unique object, used by other opcodes to identify a
				1462	region of the stack containing a variable number of objects for them
				1463	to work on. See markobject.doc for more detail.
				1464	"""),
				1465
				1466	I(name='POP_MARK',
				1467	code='1',
				1468	arg=None,
				1469	stack_before=[markobject, stackslice],
				1470	stack_after=[],
Collin Winter	e61d437	2009-05-20 17:46:47 +0000	[diff] [blame]	1471	proto=1,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1472	doc="""Pop all the stack objects at and above the topmost markobject.
				1473
				1474	When an opcode using a variable number of stack objects is done,
				1475	POP_MARK is used to remove those objects, and to remove the markobject
				1476	that delimited their starting position on the stack.
				1477	"""),
				1478
				1479	# Memo manipulation. There are really only two operations (get and put),
				1480	# each in all-text, "short binary", and "long binary" flavors.
				1481
				1482	I(name='GET',
				1483	code='g',
				1484	arg=decimalnl_short,
				1485	stack_before=[],
				1486	stack_after=[anyobject],
				1487	proto=0,
				1488	doc="""Read an object from the memo and push it on the stack.
				1489
Ezio Melotti	1392500	2011-03-16 11:05:33 +0200	[diff] [blame]	1490	The index of the memo object to push is given by the newline-terminated
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1491	decimal string following. BINGET and LONG_BINGET are space-optimized
				1492	versions.
				1493	"""),
				1494
				1495	I(name='BINGET',
				1496	code='h',
				1497	arg=uint1,
				1498	stack_before=[],
				1499	stack_after=[anyobject],
				1500	proto=1,
				1501	doc="""Read an object from the memo and push it on the stack.
				1502
				1503	The index of the memo object to push is given by the 1-byte unsigned
				1504	integer following.
				1505	"""),
				1506
				1507	I(name='LONG_BINGET',
				1508	code='j',
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1509	arg=uint4,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1510	stack_before=[],
				1511	stack_after=[anyobject],
				1512	proto=1,
				1513	doc="""Read an object from the memo and push it on the stack.
				1514
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1515	The index of the memo object to push is given by the 4-byte unsigned
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1516	little-endian integer following.
				1517	"""),
				1518
				1519	I(name='PUT',
				1520	code='p',
				1521	arg=decimalnl_short,
				1522	stack_before=[],
				1523	stack_after=[],
				1524	proto=0,
				1525	doc="""Store the stack top into the memo. The stack is not popped.
				1526
				1527	The index of the memo location to write into is given by the newline-
				1528	terminated decimal string following. BINPUT and LONG_BINPUT are
				1529	space-optimized versions.
				1530	"""),
				1531
				1532	I(name='BINPUT',
				1533	code='q',
				1534	arg=uint1,
				1535	stack_before=[],
				1536	stack_after=[],
				1537	proto=1,
				1538	doc="""Store the stack top into the memo. The stack is not popped.
				1539
				1540	The index of the memo location to write into is given by the 1-byte
				1541	unsigned integer following.
				1542	"""),
				1543
				1544	I(name='LONG_BINPUT',
				1545	code='r',
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1546	arg=uint4,
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1547	stack_before=[],
				1548	stack_after=[],
				1549	proto=1,
				1550	doc="""Store the stack top into the memo. The stack is not popped.
				1551
				1552	The index of the memo location to write into is given by the 4-byte
Alexandre Vassalotti	8db89ca	2013-04-14 03:30:35 -0700	[diff] [blame]	1553	unsigned little-endian integer following.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1554	"""),
				1555
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1556	# Access the extension registry (predefined objects). Akin to the GET
				1557	# family.
				1558
				1559	I(name='EXT1',
				1560	code='\x82',
				1561	arg=uint1,
				1562	stack_before=[],
				1563	stack_after=[anyobject],
				1564	proto=2,
				1565	doc="""Extension code.
				1566
				1567	This code and the similar EXT2 and EXT4 allow using a registry
				1568	of popular objects that are pickled by name, typically classes.
				1569	It is envisioned that through a global negotiation and
				1570	registration process, third parties can set up a mapping between
				1571	ints and object names.
				1572
				1573	In order to guarantee pickle interchangeability, the extension
				1574	code registry ought to be global, although a range of codes may
				1575	be reserved for private use.
				1576
				1577	EXT1 has a 1-byte integer argument. This is used to index into the
				1578	extension registry, and the object at that index is pushed on the stack.
				1579	"""),
				1580
				1581	I(name='EXT2',
				1582	code='\x83',
				1583	arg=uint2,
				1584	stack_before=[],
				1585	stack_after=[anyobject],
				1586	proto=2,
				1587	doc="""Extension code.
				1588
				1589	See EXT1. EXT2 has a two-byte integer argument.
				1590	"""),
				1591
				1592	I(name='EXT4',
				1593	code='\x84',
				1594	arg=int4,
				1595	stack_before=[],
				1596	stack_after=[anyobject],
				1597	proto=2,
				1598	doc="""Extension code.
				1599
				1600	See EXT1. EXT4 has a four-byte integer argument.
				1601	"""),
				1602
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1603	# Push a class object, or module function, on the stack, via its module
				1604	# and name.
				1605
				1606	I(name='GLOBAL',
				1607	code='c',
				1608	arg=stringnl_noescape_pair,
				1609	stack_before=[],
				1610	stack_after=[anyobject],
				1611	proto=0,
				1612	doc="""Push a global object (module.attr) on the stack.
				1613
				1614	Two newline-terminated strings follow the GLOBAL opcode. The first is
				1615	taken as a module name, and the second as a class name. The class
				1616	object module.class is pushed on the stack. More accurately, the
				1617	object returned by self.find_class(module, class) is pushed on the
				1618	stack, so unpickling subclasses can override this form of lookup.
				1619	"""),
				1620
				1621	# Ways to build objects of classes pickle doesn't know about directly
				1622	# (user-defined classes). I despair of documenting this accurately
				1623	# and comprehensibly -- you really have to read the pickle code to
				1624	# find all the special cases.
				1625
				1626	I(name='REDUCE',
				1627	code='R',
				1628	arg=None,
				1629	stack_before=[anyobject, anyobject],
				1630	stack_after=[anyobject],
				1631	proto=0,
				1632	doc="""Push an object built from a callable and an argument tuple.
				1633
				1634	The opcode is named to remind of the __reduce__() method.
				1635
				1636	Stack before: ... callable pytuple
				1637	Stack after: ... callable(*pytuple)
				1638
				1639	The callable and the argument tuple are the first two items returned
				1640	by a __reduce__ method. Applying the callable to the argtuple is
				1641	supposed to reproduce the original object, or at least get it started.
				1642	If the __reduce__ method returns a 3-tuple, the last component is an
				1643	argument to be passed to the object's __setstate__, and then the REDUCE
				1644	opcode is followed by code to create setstate's argument, and then a
				1645	BUILD opcode to apply __setstate__ to that argument.
				1646
Guido van Rossum	1325790	2007-06-07 23:15:56 +0000	[diff] [blame]	1647	If not isinstance(callable, type), REDUCE complains unless the
Alexandre Vassalotti	f7fa63d	2008-05-11 08:55:36 +0000	[diff] [blame]	1648	callable has been registered with the copyreg module's
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1649	safe_constructors dict, or the callable has a magic
				1650	'__safe_for_unpickling__' attribute with a true value. I'm not sure
				1651	why it does this, but I've sure seen this complaint often enough when
				1652	I didn't want to <wink>.
				1653	"""),
				1654
				1655	I(name='BUILD',
				1656	code='b',
				1657	arg=None,
				1658	stack_before=[anyobject, anyobject],
				1659	stack_after=[anyobject],
				1660	proto=0,
				1661	doc="""Finish building an object, via __setstate__ or dict update.
				1662
				1663	Stack before: ... anyobject argument
				1664	Stack after: ... anyobject
				1665
				1666	where anyobject may have been mutated, as follows:
				1667
				1668	If the object has a __setstate__ method,
				1669
				1670	anyobject.__setstate__(argument)
				1671
				1672	is called.
				1673
				1674	Else the argument must be a dict, the object must have a __dict__, and
				1675	the object is updated via
				1676
				1677	anyobject.__dict__.update(argument)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1678	"""),
				1679
				1680	I(name='INST',
				1681	code='i',
				1682	arg=stringnl_noescape_pair,
				1683	stack_before=[markobject, stackslice],
				1684	stack_after=[anyobject],
				1685	proto=0,
				1686	doc="""Build a class instance.
				1687
				1688	This is the protocol 0 version of protocol 1's OBJ opcode.
				1689	INST is followed by two newline-terminated strings, giving a
				1690	module and class name, just as for the GLOBAL opcode (and see
				1691	GLOBAL for more details about that). self.find_class(module, name)
				1692	is used to get a class object.
				1693
				1694	In addition, all the objects on the stack following the topmost
				1695	markobject are gathered into a tuple and popped (along with the
				1696	topmost markobject), just as for the TUPLE opcode.
				1697
				1698	Now it gets complicated. If all of these are true:
				1699
				1700	+ The argtuple is empty (markobject was at the top of the stack
				1701	at the start).
				1702
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1703	+ The class object does not have a __getinitargs__ attribute.
				1704
				1705	then we want to create an old-style class instance without invoking
				1706	its __init__() method (pickle has waffled on this over the years; not
				1707	calling __init__() is current wisdom). In this case, an instance of
				1708	an old-style dummy class is created, and then we try to rebind its
				1709	__class__ attribute to the desired class object. If this succeeds,
Guido van Rossum	a8add0e	2007-05-14 22:03:55 +0000	[diff] [blame]	1710	the new instance object is pushed on the stack, and we're done.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1711
				1712	Else (the argtuple is not empty, it's not an old-style class object,
				1713	or the class object does have a __getinitargs__ attribute), the code
				1714	first insists that the class object have a __safe_for_unpickling__
				1715	attribute. Unlike as for the __safe_for_unpickling__ check in REDUCE,
				1716	it doesn't matter whether this attribute has a true or false value, it
Guido van Rossum	99603b0	2007-07-20 00:22:32 +0000	[diff] [blame]	1717	only matters whether it exists (XXX this is a bug). If
				1718	__safe_for_unpickling__ doesn't exist, UnpicklingError is raised.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1719
				1720	Else (the class object does have a __safe_for_unpickling__ attr),
				1721	the class object obtained from INST's arguments is applied to the
				1722	argtuple obtained from the stack, and the resulting instance object
				1723	is pushed on the stack.
Tim Peters	2b93c4c	2003-01-30 16:35:08 +0000	[diff] [blame]	1724
				1725	NOTE: checks for __safe_for_unpickling__ went away in Python 2.3.
Florent Xicluna	aa6c1d2	2011-12-12 18:54:29 +0100	[diff] [blame]	1726	NOTE: the distinction between old-style and new-style classes does
				1727	not make sense in Python 3.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1728	"""),
				1729
				1730	I(name='OBJ',
				1731	code='o',
				1732	arg=None,
				1733	stack_before=[markobject, anyobject, stackslice],
				1734	stack_after=[anyobject],
				1735	proto=1,
				1736	doc="""Build a class instance.
				1737
				1738	This is the protocol 1 version of protocol 0's INST opcode, and is
				1739	very much like it. The major difference is that the class object
				1740	is taken off the stack, allowing it to be retrieved from the memo
				1741	repeatedly if several instances of the same class are created. This
				1742	can be much more efficient (in both time and space) than repeatedly
				1743	embedding the module and class names in INST opcodes.
				1744
				1745	Unlike INST, OBJ takes no arguments from the opcode stream. Instead
				1746	the class object is taken off the stack, immediately above the
				1747	topmost markobject:
				1748
				1749	Stack before: ... markobject classobject stackslice
				1750	Stack after: ... new_instance_object
				1751
				1752	As for INST, the remainder of the stack above the markobject is
				1753	gathered into an argument tuple, and then the logic seems identical,
Guido van Rossum	ecb1104	2003-01-29 06:24:30 +0000	[diff] [blame]	1754	except that no __safe_for_unpickling__ check is done (XXX this is
Guido van Rossum	99603b0	2007-07-20 00:22:32 +0000	[diff] [blame]	1755	a bug). See INST for the gory details.
Tim Peters	2b93c4c	2003-01-30 16:35:08 +0000	[diff] [blame]	1756
				1757	NOTE: In Python 2.3, INST and OBJ are identical except for how they
				1758	get the class object. That was always the intent; the implementations
				1759	had diverged for accidental reasons.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1760	"""),
				1761
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1762	I(name='NEWOBJ',
				1763	code='\x81',
				1764	arg=None,
				1765	stack_before=[anyobject, anyobject],
				1766	stack_after=[anyobject],
				1767	proto=2,
				1768	doc="""Build an object instance.
				1769
				1770	The stack before should be thought of as containing a class
				1771	object followed by an argument tuple (the tuple being the stack
				1772	top). Call these cls and args. They are popped off the stack,
				1773	and the value returned by cls.__new__(cls, *args) is pushed back
				1774	onto the stack.
				1775	"""),
				1776
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1777	# Machine control.
				1778
Tim Peters	fdc0346	2003-01-28 04:56:33 +0000	[diff] [blame]	1779	I(name='PROTO',
				1780	code='\x80',
				1781	arg=uint1,
				1782	stack_before=[],
				1783	stack_after=[],
				1784	proto=2,
				1785	doc="""Protocol version indicator.
				1786
				1787	For protocol 2 and above, a pickle must start with this opcode.
				1788	The argument is the protocol version, an int in range(2, 256).
				1789	"""),
				1790
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1791	I(name='STOP',
				1792	code='.',
				1793	arg=None,
				1794	stack_before=[anyobject],
				1795	stack_after=[],
				1796	proto=0,
				1797	doc="""Stop the unpickling machine.
				1798
				1799	Every pickle ends with this opcode. The object at the top of the stack
				1800	is popped, and that's the result of unpickling. The stack should be
				1801	empty then.
				1802	"""),
				1803
				1804	# Ways to deal with persistent IDs.
				1805
				1806	I(name='PERSID',
				1807	code='P',
				1808	arg=stringnl_noescape,
				1809	stack_before=[],
				1810	stack_after=[anyobject],
				1811	proto=0,
				1812	doc="""Push an object identified by a persistent ID.
				1813
				1814	The pickle module doesn't define what a persistent ID means. PERSID's
				1815	argument is a newline-terminated str-style (no embedded escapes, no
				1816	bracketing quote characters) string, which is "the persistent ID".
				1817	The unpickler passes this string to self.persistent_load(). Whatever
				1818	object that returns is pushed on the stack. There is no implementation
				1819	of persistent_load() in Python's unpickler: it must be supplied by an
				1820	unpickler subclass.
				1821	"""),
				1822
				1823	I(name='BINPERSID',
				1824	code='Q',
				1825	arg=None,
				1826	stack_before=[anyobject],
				1827	stack_after=[anyobject],
				1828	proto=1,
				1829	doc="""Push an object identified by a persistent ID.
				1830
				1831	Like PERSID, except the persistent ID is popped off the stack (instead
				1832	of being a string embedded in the opcode bytestream). The persistent
				1833	ID is passed to self.persistent_load(), and whatever object that
				1834	returns is pushed on the stack. See PERSID for more detail.
				1835	"""),
				1836	]
				1837	del I
				1838
				1839	# Verify uniqueness of .name and .code members.
				1840	name2i = {}
				1841	code2i = {}
				1842
				1843	for i, d in enumerate(opcodes):
				1844	if d.name in name2i:
				1845	raise ValueError("repeated name %r at indices %d and %d" %
				1846	(d.name, name2i[d.name], i))
				1847	if d.code in code2i:
				1848	raise ValueError("repeated code %r at indices %d and %d" %
				1849	(d.code, code2i[d.code], i))
				1850
				1851	name2i[d.name] = i
				1852	code2i[d.code] = i
				1853
				1854	del name2i, code2i, i, d
				1855
				1856	##############################################################################
				1857	# Build a code2op dict, mapping opcode characters to OpcodeInfo records.
				1858	# Also ensure we've got the same stuff as pickle.py, although the
				1859	# introspection here is dicey.
				1860
				1861	code2op = {}
				1862	for d in opcodes:
				1863	code2op[d.code] = d
				1864	del d
				1865
				1866	def assure_pickle_consistency(verbose=False):
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1867
				1868	copy = code2op.copy()
				1869	for name in pickle.__all__:
				1870	if not re.match("[A-Z][A-Z0-9_]+$", name):
				1871	if verbose:
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	1872	print("skipping %r: it doesn't look like an opcode name" % name)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1873	continue
				1874	picklecode = getattr(pickle, name)
Guido van Rossum	617dbc4	2007-05-07 23:57:08 +0000	[diff] [blame]	1875	if not isinstance(picklecode, bytes) or len(picklecode) != 1:
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1876	if verbose:
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	1877	print(("skipping %r: value %r doesn't look like a pickle "
				1878	"code" % (name, picklecode)))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1879	continue
Guido van Rossum	617dbc4	2007-05-07 23:57:08 +0000	[diff] [blame]	1880	picklecode = picklecode.decode("latin-1")
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1881	if picklecode in copy:
				1882	if verbose:
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	1883	print("checking name %r w/ code %r for consistency" % (
				1884	name, picklecode))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1885	d = copy[picklecode]
				1886	if d.name != name:
				1887	raise ValueError("for pickle code %r, pickle.py uses name %r "
				1888	"but we're using name %r" % (picklecode,
				1889	name,
				1890	d.name))
				1891	# Forget this one. Any left over in copy at the end are a problem
				1892	# of a different kind.
				1893	del copy[picklecode]
				1894	else:
				1895	raise ValueError("pickle.py appears to have a pickle opcode with "
				1896	"name %r and code %r, but we don't" %
				1897	(name, picklecode))
				1898	if copy:
				1899	msg = ["we appear to have pickle opcodes that pickle.py doesn't have:"]
				1900	for code, d in copy.items():
				1901	msg.append(" name %r with code %r" % (d.name, code))
				1902	raise ValueError("\n".join(msg))
				1903
				1904	assure_pickle_consistency()
Tim Peters	c0c12b5	2003-01-29 00:56:17 +0000	[diff] [blame]	1905	del assure_pickle_consistency
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1906
				1907	##############################################################################
				1908	# A pickle opcode generator.
				1909
				1910	def genops(pickle):
Guido van Rossum	a72ded9	2003-01-27 19:40:47 +0000	[diff] [blame]	1911	"""Generate all the opcodes in a pickle.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1912
				1913	'pickle' is a file-like object, or string, containing the pickle.
				1914
				1915	Each opcode in the pickle is generated, from the current pickle position,
				1916	stopping after a STOP opcode is delivered. A triple is generated for
				1917	each opcode:
				1918
				1919	opcode, arg, pos
				1920
				1921	opcode is an OpcodeInfo record, describing the current opcode.
				1922
				1923	If the opcode has an argument embedded in the pickle, arg is its decoded
				1924	value, as a Python object. If the opcode doesn't have an argument, arg
				1925	is None.
				1926
				1927	If the pickle has a tell() method, pos was the value of pickle.tell()
Guido van Rossum	34d1928	2007-08-09 01:03:29 +0000	[diff] [blame]	1928	before reading the current opcode. If the pickle is a bytes object,
				1929	it's wrapped in a BytesIO object, and the latter's tell() result is
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1930	used. Else (the pickle doesn't have a tell(), and it's not obvious how
				1931	to query its current position) pos is None.
				1932	"""
				1933
Guido van Rossum	98297ee	2007-11-06 21:34:58 +0000	[diff] [blame]	1934	if isinstance(pickle, bytes_types):
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	1935	import io
				1936	pickle = io.BytesIO(pickle)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1937
				1938	if hasattr(pickle, "tell"):
				1939	getpos = pickle.tell
				1940	else:
				1941	getpos = lambda: None
				1942
				1943	while True:
				1944	pos = getpos()
				1945	code = pickle.read(1)
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	1946	opcode = code2op.get(code.decode("latin-1"))
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1947	if opcode is None:
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	1948	if code == b"":
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1949	raise ValueError("pickle exhausted before seeing STOP")
				1950	else:
				1951	raise ValueError("at position %s, opcode %r unknown" % (
				1952	pos is None and "<unknown>" or pos,
				1953	code))
				1954	if opcode.arg is None:
				1955	arg = None
				1956	else:
				1957	arg = opcode.arg.reader(pickle)
				1958	yield opcode, arg, pos
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	1959	if code == b'.':
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1960	assert opcode.name == 'STOP'
				1961	break
				1962
				1963	##############################################################################
Christian Heimes	3feef61	2008-02-11 06:19:17 +0000	[diff] [blame]	1964	# A pickle optimizer.
				1965
				1966	def optimize(p):
				1967	'Optimize a pickle string by removing unused PUT opcodes'
				1968	gets = set() # set of args used by a GET opcode
				1969	puts = [] # (arg, startpos, stoppos) for the PUT opcodes
				1970	prevpos = None # set to pos if previous opcode was a PUT
				1971	for opcode, arg, pos in genops(p):
				1972	if prevpos is not None:
				1973	puts.append((prevarg, prevpos, pos))
				1974	prevpos = None
				1975	if 'PUT' in opcode.name:
				1976	prevarg, prevpos = arg, pos
				1977	elif 'GET' in opcode.name:
				1978	gets.add(arg)
				1979
				1980	# Copy the pickle string except for PUTS without a corresponding GET
				1981	s = []
				1982	i = 0
				1983	for arg, start, stop in puts:
				1984	j = stop if (arg in gets) else start
				1985	s.append(p[i:j])
				1986	i = stop
				1987	s.append(p[i:])
Christian Heimes	126d29a	2008-02-11 22:57:17 +0000	[diff] [blame]	1988	return b''.join(s)
Christian Heimes	3feef61	2008-02-11 06:19:17 +0000	[diff] [blame]	1989
				1990	##############################################################################
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1991	# A symbolic pickle disassembler.
				1992
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	1993	def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	1994	"""Produce a symbolic disassembly of a pickle.
				1995
				1996	'pickle' is a file-like object, or string, containing a (at least one)
				1997	pickle. The pickle is disassembled from the current position, through
				1998	the first STOP opcode encountered.
				1999
				2000	Optional arg 'out' is a file-like object to which the disassembly is
				2001	printed. It defaults to sys.stdout.
				2002
Tim Peters	62235e7	2003-02-05 19:55:53 +0000	[diff] [blame]	2003	Optional arg 'memo' is a Python dict, used as the pickle's memo. It
				2004	may be mutated by dis(), if the pickle contains PUT or BINPUT opcodes.
				2005	Passing the same memo object to another dis() call then allows disassembly
				2006	to proceed across multiple pickles that were all created by the same
				2007	pickler with the same memo. Ordinarily you don't need to worry about this.
				2008
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2009	Optional arg 'indentlevel' is the number of blanks by which to indent
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2010	a new MARK level. It defaults to 4.
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2011
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2012	Optional arg 'annotate' if nonzero instructs dis() to add short
				2013	description of the opcode on each line of disassembled output.
				2014	The value given to 'annotate' must be an integer and is used as a
				2015	hint for the column where annotation should start. The default
				2016	value is 0, meaning no annotations.
				2017
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2018	In addition to printing the disassembly, some sanity checks are made:
				2019
				2020	+ All embedded opcode arguments "make sense".
				2021
				2022	+ Explicit and implicit pop operations have enough items on the stack.
				2023
				2024	+ When an opcode implicitly refers to a markobject, a markobject is
				2025	actually on the stack.
				2026
				2027	+ A memo entry isn't referenced before it's defined.
				2028
				2029	+ The markobject isn't stored in the memo.
				2030
				2031	+ A memo entry isn't redefined.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2032	"""
				2033
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2034	# Most of the hair here is for sanity checks, but most of it is needed
				2035	# anyway to detect when a protocol 0 POP takes a MARK off the stack
				2036	# (which in turn is needed to indent MARK blocks correctly).
				2037
				2038	stack = [] # crude emulation of unpickler stack
Tim Peters	62235e7	2003-02-05 19:55:53 +0000	[diff] [blame]	2039	if memo is None:
				2040	memo = {} # crude emulation of unpicker memo
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2041	maxproto = -1 # max protocol number seen
				2042	markstack = [] # bytecode positions of MARK opcodes
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2043	indentchunk = ' ' * indentlevel
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2044	errormsg = None
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2045	annocol = annotate # columnt hint for annotations
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2046	for opcode, arg, pos in genops(pickle):
				2047	if pos is not None:
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	2048	print("%5d:" % pos, end=' ', file=out)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2049
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2050	line = "%-4s %s%s" % (repr(opcode.code)[1:-1],
				2051	indentchunk * len(markstack),
				2052	opcode.name)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2053
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2054	maxproto = max(maxproto, opcode.proto)
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2055	before = opcode.stack_before # don't mutate
				2056	after = opcode.stack_after # don't mutate
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2057	numtopop = len(before)
				2058
				2059	# See whether a MARK should be popped.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2060	markmsg = None
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2061	if markobject in before or (opcode.name == "POP" and
				2062	stack and
				2063	stack[-1] is markobject):
				2064	assert markobject not in after
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2065	if __debug__:
				2066	if markobject in before:
				2067	assert before[-1] is stackslice
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2068	if markstack:
				2069	markpos = markstack.pop()
				2070	if markpos is None:
				2071	markmsg = "(MARK at unknown opcode offset)"
				2072	else:
				2073	markmsg = "(MARK at %d)" % markpos
				2074	# Pop everything at and after the topmost markobject.
				2075	while stack[-1] is not markobject:
				2076	stack.pop()
				2077	stack.pop()
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2078	# Stop later code from popping too much.
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2079	try:
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2080	numtopop = before.index(markobject)
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2081	except ValueError:
				2082	assert opcode.name == "POP"
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2083	numtopop = 0
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2084	else:
				2085	errormsg = markmsg = "no MARK exists on stack"
				2086
				2087	# Check for correct memo usage.
				2088	if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"):
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2089	assert arg is not None
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2090	if arg in memo:
				2091	errormsg = "memo key %r already defined" % arg
				2092	elif not stack:
				2093	errormsg = "stack is empty -- can't store into memo"
				2094	elif stack[-1] is markobject:
				2095	errormsg = "can't store markobject in the memo"
				2096	else:
				2097	memo[arg] = stack[-1]
				2098
				2099	elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
				2100	if arg in memo:
				2101	assert len(after) == 1
				2102	after = [memo[arg]] # for better stack emulation
				2103	else:
				2104	errormsg = "memo key %r has never been stored into" % arg
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2105
				2106	if arg is not None or markmsg:
				2107	# make a mild effort to align arguments
				2108	line += ' ' * (10 - len(opcode.name))
				2109	if arg is not None:
				2110	line += ' ' + repr(arg)
				2111	if markmsg:
				2112	line += ' ' + markmsg
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2113	if annotate:
				2114	line += ' ' * (annocol - len(line))
				2115	# make a mild effort to align annotations
				2116	annocol = len(line)
				2117	if annocol > 50:
				2118	annocol = annotate
				2119	line += ' ' + opcode.doc.split('\n', 1)[0]
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	2120	print(line, file=out)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2121
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2122	if errormsg:
				2123	# Note that we delayed complaining until the offending opcode
				2124	# was printed.
				2125	raise ValueError(errormsg)
				2126
				2127	# Emulate the stack effects.
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2128	if len(stack) < numtopop:
				2129	raise ValueError("tries to pop %d items from stack with "
				2130	"only %d items" % (numtopop, len(stack)))
				2131	if numtopop:
				2132	del stack[-numtopop:]
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2133	if markobject in after:
Tim Peters	43277d6	2003-01-30 15:02:12 +0000	[diff] [blame]	2134	assert markobject not in before
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2135	markstack.append(pos)
				2136
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2137	stack.extend(after)
				2138
Guido van Rossum	be19ed7	2007-02-09 05:37:30 +0000	[diff] [blame]	2139	print("highest protocol among opcodes =", maxproto, file=out)
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2140	if stack:
				2141	raise ValueError("stack not empty after STOP: %r" % stack)
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2142
Tim Peters	90718a4	2005-02-15 16:22:34 +0000	[diff] [blame]	2143	# For use in the doctest, simply as an example of a class to pickle.
				2144	class _Example:
				2145	def __init__(self, value):
				2146	self.value = value
				2147
Guido van Rossum	03e3532	2003-01-28 15:37:13 +0000	[diff] [blame]	2148	_dis_test = r"""
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2149	>>> import pickle
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	2150	>>> x = [1, 2, (3, 4), {b'abc': "def"}]
				2151	>>> pkl0 = pickle.dumps(x, 0)
				2152	>>> dis(pkl0)
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2153	0: ( MARK
				2154	1: l LIST (MARK at 0)
				2155	2: p PUT 0
Guido van Rossum	f410000	2007-01-15 00:21:46 +0000	[diff] [blame]	2156	5: L LONG 1
Mark Dickinson	8dd0514	2009-01-20 20:43:58 +0000	[diff] [blame]	2157	9: a APPEND
				2158	10: L LONG 2
				2159	14: a APPEND
				2160	15: ( MARK
				2161	16: L LONG 3
				2162	20: L LONG 4
				2163	24: t TUPLE (MARK at 15)
				2164	25: p PUT 1
				2165	28: a APPEND
				2166	29: ( MARK
				2167	30: d DICT (MARK at 29)
				2168	31: p PUT 2
Alexandre Vassalotti	3bfc65a	2011-12-13 13:08:09 -0500	[diff] [blame]	2169	34: c GLOBAL '_codecs encode'
				2170	50: p PUT 3
				2171	53: ( MARK
				2172	54: V UNICODE 'abc'
Antoine Pitrou	d9dfaa9	2009-06-04 20:32:06 +0000	[diff] [blame]	2173	59: p PUT 4
Alexandre Vassalotti	3bfc65a	2011-12-13 13:08:09 -0500	[diff] [blame]	2174	62: V UNICODE 'latin1'
				2175	70: p PUT 5
				2176	73: t TUPLE (MARK at 53)
				2177	74: p PUT 6
				2178	77: R REDUCE
				2179	78: p PUT 7
				2180	81: V UNICODE 'def'
				2181	86: p PUT 8
				2182	89: s SETITEM
				2183	90: a APPEND
				2184	91: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2185	highest protocol among opcodes = 0
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2186
				2187	Try again with a "binary" pickle.
				2188
Guido van Rossum	f416981	2008-03-17 22:56:06 +0000	[diff] [blame]	2189	>>> pkl1 = pickle.dumps(x, 1)
				2190	>>> dis(pkl1)
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2191	0: ] EMPTY_LIST
				2192	1: q BINPUT 0
				2193	3: ( MARK
				2194	4: K BININT1 1
				2195	6: K BININT1 2
				2196	8: ( MARK
				2197	9: K BININT1 3
				2198	11: K BININT1 4
				2199	13: t TUPLE (MARK at 8)
				2200	14: q BINPUT 1
				2201	16: } EMPTY_DICT
				2202	17: q BINPUT 2
Alexandre Vassalotti	3bfc65a	2011-12-13 13:08:09 -0500	[diff] [blame]	2203	19: c GLOBAL '_codecs encode'
				2204	35: q BINPUT 3
				2205	37: ( MARK
				2206	38: X BINUNICODE 'abc'
				2207	46: q BINPUT 4
				2208	48: X BINUNICODE 'latin1'
				2209	59: q BINPUT 5
				2210	61: t TUPLE (MARK at 37)
				2211	62: q BINPUT 6
				2212	64: R REDUCE
				2213	65: q BINPUT 7
				2214	67: X BINUNICODE 'def'
				2215	75: q BINPUT 8
				2216	77: s SETITEM
				2217	78: e APPENDS (MARK at 3)
				2218	79: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2219	highest protocol among opcodes = 1
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2220
				2221	Exercise the INST/OBJ/BUILD family.
				2222
Mark Dickinson	cddcf44	2009-01-24 21:46:33 +0000	[diff] [blame]	2223	>>> import pickletools
				2224	>>> dis(pickle.dumps(pickletools.dis, 0))
				2225	0: c GLOBAL 'pickletools dis'
				2226	17: p PUT 0
				2227	20: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2228	highest protocol among opcodes = 0
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2229
Tim Peters	90718a4	2005-02-15 16:22:34 +0000	[diff] [blame]	2230	>>> from pickletools import _Example
				2231	>>> x = [_Example(42)] * 2
Guido van Rossum	f29d3d6	2003-01-27 22:47:53 +0000	[diff] [blame]	2232	>>> dis(pickle.dumps(x, 0))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2233	0: ( MARK
				2234	1: l LIST (MARK at 0)
				2235	2: p PUT 0
Antoine Pitrou	d9dfaa9	2009-06-04 20:32:06 +0000	[diff] [blame]	2236	5: c GLOBAL 'copy_reg _reconstructor'
				2237	30: p PUT 1
				2238	33: ( MARK
				2239	34: c GLOBAL 'pickletools _Example'
				2240	56: p PUT 2
				2241	59: c GLOBAL '__builtin__ object'
				2242	79: p PUT 3
				2243	82: N NONE
				2244	83: t TUPLE (MARK at 33)
				2245	84: p PUT 4
				2246	87: R REDUCE
				2247	88: p PUT 5
				2248	91: ( MARK
				2249	92: d DICT (MARK at 91)
				2250	93: p PUT 6
				2251	96: V UNICODE 'value'
				2252	103: p PUT 7
				2253	106: L LONG 42
				2254	111: s SETITEM
				2255	112: b BUILD
Mark Dickinson	8dd0514	2009-01-20 20:43:58 +0000	[diff] [blame]	2256	113: a APPEND
Antoine Pitrou	d9dfaa9	2009-06-04 20:32:06 +0000	[diff] [blame]	2257	114: g GET 5
				2258	117: a APPEND
				2259	118: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2260	highest protocol among opcodes = 0
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2261
				2262	>>> dis(pickle.dumps(x, 1))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2263	0: ] EMPTY_LIST
				2264	1: q BINPUT 0
				2265	3: ( MARK
Antoine Pitrou	d9dfaa9	2009-06-04 20:32:06 +0000	[diff] [blame]	2266	4: c GLOBAL 'copy_reg _reconstructor'
				2267	29: q BINPUT 1
				2268	31: ( MARK
				2269	32: c GLOBAL 'pickletools _Example'
				2270	54: q BINPUT 2
				2271	56: c GLOBAL '__builtin__ object'
				2272	76: q BINPUT 3
				2273	78: N NONE
				2274	79: t TUPLE (MARK at 31)
				2275	80: q BINPUT 4
				2276	82: R REDUCE
				2277	83: q BINPUT 5
				2278	85: } EMPTY_DICT
				2279	86: q BINPUT 6
				2280	88: X BINUNICODE 'value'
				2281	98: q BINPUT 7
				2282	100: K BININT1 42
				2283	102: s SETITEM
				2284	103: b BUILD
				2285	104: h BINGET 5
				2286	106: e APPENDS (MARK at 3)
				2287	107: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2288	highest protocol among opcodes = 1
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2289
				2290	Try "the canonical" recursive-object test.
				2291
				2292	>>> L = []
				2293	>>> T = L,
				2294	>>> L.append(T)
				2295	>>> L[0] is T
				2296	True
				2297	>>> T[0] is L
				2298	True
				2299	>>> L[0][0] is L
				2300	True
				2301	>>> T[0][0] is T
				2302	True
Guido van Rossum	f29d3d6	2003-01-27 22:47:53 +0000	[diff] [blame]	2303	>>> dis(pickle.dumps(L, 0))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2304	0: ( MARK
				2305	1: l LIST (MARK at 0)
				2306	2: p PUT 0
				2307	5: ( MARK
				2308	6: g GET 0
				2309	9: t TUPLE (MARK at 5)
				2310	10: p PUT 1
				2311	13: a APPEND
				2312	14: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2313	highest protocol among opcodes = 0
				2314
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2315	>>> dis(pickle.dumps(L, 1))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2316	0: ] EMPTY_LIST
				2317	1: q BINPUT 0
				2318	3: ( MARK
				2319	4: h BINGET 0
				2320	6: t TUPLE (MARK at 3)
				2321	7: q BINPUT 1
				2322	9: a APPEND
				2323	10: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2324	highest protocol among opcodes = 1
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2325
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2326	Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
				2327	has to emulate the stack in order to realize that the POP opcode at 16 gets
				2328	rid of the MARK at 0.
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2329
Guido van Rossum	f29d3d6	2003-01-27 22:47:53 +0000	[diff] [blame]	2330	>>> dis(pickle.dumps(T, 0))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2331	0: ( MARK
				2332	1: ( MARK
				2333	2: l LIST (MARK at 1)
				2334	3: p PUT 0
				2335	6: ( MARK
				2336	7: g GET 0
				2337	10: t TUPLE (MARK at 6)
				2338	11: p PUT 1
				2339	14: a APPEND
				2340	15: 0 POP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2341	16: 0 POP (MARK at 0)
				2342	17: g GET 1
				2343	20: . STOP
				2344	highest protocol among opcodes = 0
				2345
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2346	>>> dis(pickle.dumps(T, 1))
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2347	0: ( MARK
				2348	1: ] EMPTY_LIST
				2349	2: q BINPUT 0
				2350	4: ( MARK
				2351	5: h BINGET 0
				2352	7: t TUPLE (MARK at 4)
				2353	8: q BINPUT 1
				2354	10: a APPEND
				2355	11: 1 POP_MARK (MARK at 0)
				2356	12: h BINGET 1
				2357	14: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2358	highest protocol among opcodes = 1
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2359
				2360	Try protocol 2.
				2361
				2362	>>> dis(pickle.dumps(L, 2))
				2363	0: \x80 PROTO 2
				2364	2: ] EMPTY_LIST
				2365	3: q BINPUT 0
				2366	5: h BINGET 0
				2367	7: \x85 TUPLE1
				2368	8: q BINPUT 1
				2369	10: a APPEND
				2370	11: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2371	highest protocol among opcodes = 2
Tim Peters	d0f7c86	2003-01-28 15:27:57 +0000	[diff] [blame]	2372
				2373	>>> dis(pickle.dumps(T, 2))
				2374	0: \x80 PROTO 2
				2375	2: ] EMPTY_LIST
				2376	3: q BINPUT 0
				2377	5: h BINGET 0
				2378	7: \x85 TUPLE1
				2379	8: q BINPUT 1
				2380	10: a APPEND
				2381	11: 0 POP
				2382	12: h BINGET 1
				2383	14: . STOP
Tim Peters	c1c2b3e	2003-01-29 20:12:21 +0000	[diff] [blame]	2384	highest protocol among opcodes = 2
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2385
				2386	Try protocol 3 with annotations:
				2387
				2388	>>> dis(pickle.dumps(T, 3), annotate=1)
				2389	0: \x80 PROTO 3 Protocol version indicator.
				2390	2: ] EMPTY_LIST Push an empty list.
				2391	3: q BINPUT 0 Store the stack top into the memo. The stack is not popped.
				2392	5: h BINGET 0 Read an object from the memo and push it on the stack.
				2393	7: \x85 TUPLE1 Build a one-tuple out of the topmost item on the stack.
				2394	8: q BINPUT 1 Store the stack top into the memo. The stack is not popped.
				2395	10: a APPEND Append an object to a list.
				2396	11: 0 POP Discard the top stack item, shrinking the stack by one item.
				2397	12: h BINGET 1 Read an object from the memo and push it on the stack.
				2398	14: . STOP Stop the unpickling machine.
				2399	highest protocol among opcodes = 2
				2400
Tim Peters	8ecfc8e	2003-01-27 18:51:48 +0000	[diff] [blame]	2401	"""
				2402
Tim Peters	62235e7	2003-02-05 19:55:53 +0000	[diff] [blame]	2403	_memo_test = r"""
				2404	>>> import pickle
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	2405	>>> import io
				2406	>>> f = io.BytesIO()
Tim Peters	62235e7	2003-02-05 19:55:53 +0000	[diff] [blame]	2407	>>> p = pickle.Pickler(f, 2)
				2408	>>> x = [1, 2, 3]
				2409	>>> p.dump(x)
				2410	>>> p.dump(x)
				2411	>>> f.seek(0)
Guido van Rossum	cfe5f20	2007-05-08 21:26:54 +0000	[diff] [blame]	2412	0
Tim Peters	62235e7	2003-02-05 19:55:53 +0000	[diff] [blame]	2413	>>> memo = {}
				2414	>>> dis(f, memo=memo)
				2415	0: \x80 PROTO 2
				2416	2: ] EMPTY_LIST
				2417	3: q BINPUT 0
				2418	5: ( MARK
				2419	6: K BININT1 1
				2420	8: K BININT1 2
				2421	10: K BININT1 3
				2422	12: e APPENDS (MARK at 5)
				2423	13: . STOP
				2424	highest protocol among opcodes = 2
				2425	>>> dis(f, memo=memo)
				2426	14: \x80 PROTO 2
				2427	16: h BINGET 0
				2428	18: . STOP
				2429	highest protocol among opcodes = 2
				2430	"""
				2431
# Named doctest suites exposed through the module-level __test__ mapping.
__test__ = dict(
    disassembler_test=_dis_test,
    disassembler_memo_test=_memo_test,
)
				2435
				2436	def _test():
				2437	import doctest
				2438	return doctest.testmod()
				2439
				2440	if __name__ == "__main__":
Alexander Belopolsky	60c762b	2010-07-03 20:35:53 +0000	[diff] [blame]	2441	import sys, argparse
				2442	parser = argparse.ArgumentParser(
				2443	description='disassemble one or more pickle files')
				2444	parser.add_argument(
				2445	'pickle_file', type=argparse.FileType('br'),
				2446	nargs='*', help='the pickle file')
				2447	parser.add_argument(
				2448	'-o', '--output', default=sys.stdout, type=argparse.FileType('w'),
				2449	help='the file where the output should be written')
				2450	parser.add_argument(
				2451	'-m', '--memo', action='store_true',
				2452	help='preserve memo between disassemblies')
				2453	parser.add_argument(
				2454	'-l', '--indentlevel', default=4, type=int,
				2455	help='the number of blanks by which to indent a new MARK level')
				2456	parser.add_argument(
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2457	'-a', '--annotate', action='store_true',
				2458	help='annotate each line with a short opcode description')
				2459	parser.add_argument(
Alexander Belopolsky	60c762b	2010-07-03 20:35:53 +0000	[diff] [blame]	2460	'-p', '--preamble', default="==> {name} <==",
				2461	help='if more than one pickle file is specified, print this before'
				2462	' each disassembly')
				2463	parser.add_argument(
				2464	'-t', '--test', action='store_true',
				2465	help='run self-test suite')
				2466	parser.add_argument(
				2467	'-v', action='store_true',
				2468	help='run verbosely; only affects self-test run')
				2469	args = parser.parse_args()
				2470	if args.test:
				2471	_test()
				2472	else:
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2473	annotate = 30 if args.annotate else 0
Alexander Belopolsky	60c762b	2010-07-03 20:35:53 +0000	[diff] [blame]	2474	if not args.pickle_file:
				2475	parser.print_help()
				2476	elif len(args.pickle_file) == 1:
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2477	dis(args.pickle_file[0], args.output, None,
				2478	args.indentlevel, annotate)
Alexander Belopolsky	60c762b	2010-07-03 20:35:53 +0000	[diff] [blame]	2479	else:
				2480	memo = {} if args.memo else None
				2481	for f in args.pickle_file:
				2482	preamble = args.preamble.format(name=f.name)
				2483	args.output.write(preamble + '\n')
Alexander Belopolsky	929d384	2010-07-17 15:51:21 +0000	[diff] [blame]	2484	dis(f, args.output, memo, args.indentlevel, annotate)