Blame - Lib/lzma.py - platform/external/python/cpython3

blob: 1a1b065f8e99847fd06c724d6764e313f9af2795 [file] [log] [blame]

Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	1	"""Interface to the liblzma compression library.
				2
				3	This module provides a class for reading and writing compressed files,
				4	classes for incremental (de)compression, and convenience functions for
				5	one-shot (de)compression.
				6
				7	These classes and functions support both the XZ and legacy LZMA
				8	container formats, as well as raw compressed data streams.
				9	"""
				10
				11	__all__ = [
				12	"CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
				13	"CHECK_ID_MAX", "CHECK_UNKNOWN",
				14	"FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
				15	"FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
				16	"FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
				17	"MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
				18	"MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",
				19
				20	"LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
Nadeem Vawda	e860404	2012-06-04 23:38:12 +0200	[diff] [blame]	21	"open", "compress", "decompress", "is_check_supported",
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	22	]
				23
Nadeem Vawda	e860404	2012-06-04 23:38:12 +0200	[diff] [blame]	24	import builtins
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	25	import io
				26	from _lzma import *
Nadeem Vawda	a425c3d	2012-06-21 23:36:48 +0200	[diff] [blame]	27	from _lzma import _encode_filter_properties, _decode_filter_properties
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	28
				29
				30	_MODE_CLOSED = 0
				31	_MODE_READ = 1
				32	_MODE_READ_EOF = 2
				33	_MODE_WRITE = 3
				34
				35	_BUFFER_SIZE = 8192
				36
				37
				38	class LZMAFile(io.BufferedIOBase):
				39
				40	"""A file object providing transparent LZMA (de)compression.
				41
				42	An LZMAFile can act as a wrapper for an existing file object, or
				43	refer directly to a named file on disk.
				44
				45	Note that LZMAFile provides a binary file interface - data read
				46	is returned as bytes, and data to be written must be given as bytes.
				47	"""
				48
				49	def __init__(self, filename=None, mode="r", *,
Nadeem Vawda	33c34da	2012-06-04 23:34:07 +0200	[diff] [blame]	50	format=None, check=-1, preset=None, filters=None):
				51	"""Open an LZMA-compressed file in binary mode.
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	52
Nadeem Vawda	33c34da	2012-06-04 23:34:07 +0200	[diff] [blame]	53	filename can be either an actual file name (given as a str or
				54	bytes object), in which case the named file is opened, or it can
				55	be an existing file object to read from or write to.
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	56
				57	mode can be "r" for reading (default), "w" for (over)writing, or
Nadeem Vawda	6cbb20c	2012-06-04 23:36:24 +0200	[diff] [blame]	58	"a" for appending. These can equivalently be given as "rb", "wb",
				59	and "ab" respectively.
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	60
				61	format specifies the container format to use for the file.
				62	If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
				63	default is FORMAT_XZ.
				64
				65	check specifies the integrity check to use. This argument can
				66	only be used when opening a file for writing. For FORMAT_XZ,
				67	the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
				68	support integrity checks - for these formats, check must be
				69	omitted, or be CHECK_NONE.
				70
				71	When opening a file for reading, the preset argument is not
				72	meaningful, and should be omitted. The filters argument should
				73	also be omitted, except when format is FORMAT_RAW (in which case
				74	it is required).
				75
				76	When opening a file for writing, the settings used by the
				77	compressor can be specified either as a preset compression
				78	level (with the preset argument), or in detail as a custom
				79	filter chain (with the filters argument). For FORMAT_XZ and
				80	FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
				81	level. For FORMAT_RAW, the caller must always specify a filter
				82	chain; the raw compressor does not support preset compression
				83	levels.
				84
				85	preset (if provided) should be an integer in the range 0-9,
				86	optionally OR-ed with the constant PRESET_EXTREME.
				87
				88	filters (if provided) should be a sequence of dicts. Each dict
				89	should have an entry for "id" indicating ID of the filter, plus
				90	additional entries for options to the filter.
				91	"""
				92	self._fp = None
				93	self._closefp = False
				94	self._mode = _MODE_CLOSED
				95	self._pos = 0
				96	self._size = -1
				97
Nadeem Vawda	6cbb20c	2012-06-04 23:36:24 +0200	[diff] [blame]	98	if mode in ("r", "rb"):
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	99	if check != -1:
				100	raise ValueError("Cannot specify an integrity check "
				101	"when opening a file for reading")
				102	if preset is not None:
				103	raise ValueError("Cannot specify a preset compression "
				104	"level when opening a file for reading")
				105	if format is None:
				106	format = FORMAT_AUTO
				107	mode_code = _MODE_READ
				108	# Save the args to pass to the LZMADecompressor initializer.
				109	# If the file contains multiple compressed streams, each
				110	# stream will need a separate decompressor object.
				111	self._init_args = {"format":format, "filters":filters}
				112	self._decompressor = LZMADecompressor(**self._init_args)
				113	self._buffer = None
Nadeem Vawda	6cbb20c	2012-06-04 23:36:24 +0200	[diff] [blame]	114	elif mode in ("w", "wb", "a", "ab"):
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	115	if format is None:
				116	format = FORMAT_XZ
				117	mode_code = _MODE_WRITE
				118	self._compressor = LZMACompressor(format=format, check=check,
				119	preset=preset, filters=filters)
				120	else:
				121	raise ValueError("Invalid mode: {!r}".format(mode))
				122
Nadeem Vawda	33c34da	2012-06-04 23:34:07 +0200	[diff] [blame]	123	if isinstance(filename, (str, bytes)):
Nadeem Vawda	6cbb20c	2012-06-04 23:36:24 +0200	[diff] [blame]	124	if "b" not in mode:
				125	mode += "b"
Nadeem Vawda	e860404	2012-06-04 23:38:12 +0200	[diff] [blame]	126	self._fp = builtins.open(filename, mode)
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	127	self._closefp = True
				128	self._mode = mode_code
Nadeem Vawda	33c34da	2012-06-04 23:34:07 +0200	[diff] [blame]	129	elif hasattr(filename, "read") or hasattr(filename, "write"):
				130	self._fp = filename
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	131	self._mode = mode_code
				132	else:
Nadeem Vawda	33c34da	2012-06-04 23:34:07 +0200	[diff] [blame]	133	raise TypeError("filename must be a str or bytes object, or a file")
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	134
				135	def close(self):
				136	"""Flush and close the file.
				137
				138	May be called more than once without error. Once the file is
				139	closed, any other operation on it will raise a ValueError.
				140	"""
				141	if self._mode == _MODE_CLOSED:
				142	return
				143	try:
				144	if self._mode in (_MODE_READ, _MODE_READ_EOF):
				145	self._decompressor = None
				146	self._buffer = None
				147	elif self._mode == _MODE_WRITE:
				148	self._fp.write(self._compressor.flush())
				149	self._compressor = None
				150	finally:
				151	try:
				152	if self._closefp:
				153	self._fp.close()
				154	finally:
				155	self._fp = None
				156	self._closefp = False
				157	self._mode = _MODE_CLOSED
				158
				159	@property
				160	def closed(self):
				161	"""True if this file is closed."""
				162	return self._mode == _MODE_CLOSED
				163
				164	def fileno(self):
				165	"""Return the file descriptor for the underlying file."""
				166	self._check_not_closed()
				167	return self._fp.fileno()
				168
				169	def seekable(self):
				170	"""Return whether the file supports seeking."""
Nadeem Vawda	ae557d7	2012-02-12 01:51:38 +0200	[diff] [blame]	171	return self.readable() and self._fp.seekable()
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	172
				173	def readable(self):
				174	"""Return whether the file was opened for reading."""
				175	self._check_not_closed()
				176	return self._mode in (_MODE_READ, _MODE_READ_EOF)
				177
				178	def writable(self):
				179	"""Return whether the file was opened for writing."""
				180	self._check_not_closed()
				181	return self._mode == _MODE_WRITE
				182
				183	# Mode-checking helper functions.
				184
				185	def _check_not_closed(self):
				186	if self.closed:
				187	raise ValueError("I/O operation on closed file")
				188
				189	def _check_can_read(self):
				190	if not self.readable():
				191	raise io.UnsupportedOperation("File not open for reading")
				192
				193	def _check_can_write(self):
				194	if not self.writable():
				195	raise io.UnsupportedOperation("File not open for writing")
				196
				197	def _check_can_seek(self):
Nadeem Vawda	ae557d7	2012-02-12 01:51:38 +0200	[diff] [blame]	198	if not self.readable():
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	199	raise io.UnsupportedOperation("Seeking is only supported "
				200	"on files open for reading")
Nadeem Vawda	ae557d7	2012-02-12 01:51:38 +0200	[diff] [blame]	201	if not self._fp.seekable():
				202	raise io.UnsupportedOperation("The underlying file object "
				203	"does not support seeking")
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	204
				205	# Fill the readahead buffer if it is empty. Returns False on EOF.
				206	def _fill_buffer(self):
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	207	# Depending on the input data, our call to the decompressor may not
				208	# return any data. In this case, try again after reading another block.
				209	while True:
				210	if self._buffer:
				211	return True
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	212
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	213	if self._decompressor.unused_data:
				214	rawblock = self._decompressor.unused_data
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	215	else:
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	216	rawblock = self._fp.read(_BUFFER_SIZE)
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	217
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	218	if not rawblock:
				219	if self._decompressor.eof:
				220	self._mode = _MODE_READ_EOF
				221	self._size = self._pos
				222	return False
				223	else:
				224	raise EOFError("Compressed file ended before the "
				225	"end-of-stream marker was reached")
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	226
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	227	# Continue to next stream.
				228	if self._decompressor.eof:
				229	self._decompressor = LZMADecompressor(**self._init_args)
				230
				231	self._buffer = self._decompressor.decompress(rawblock)
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	232
				233	# Read data until EOF.
				234	# If return_data is false, consume the data without returning it.
				235	def _read_all(self, return_data=True):
				236	blocks = []
				237	while self._fill_buffer():
				238	if return_data:
				239	blocks.append(self._buffer)
				240	self._pos += len(self._buffer)
				241	self._buffer = None
				242	if return_data:
				243	return b"".join(blocks)
				244
				245	# Read a block of up to n bytes.
				246	# If return_data is false, consume the data without returning it.
				247	def _read_block(self, n, return_data=True):
				248	blocks = []
				249	while n > 0 and self._fill_buffer():
				250	if n < len(self._buffer):
				251	data = self._buffer[:n]
				252	self._buffer = self._buffer[n:]
				253	else:
				254	data = self._buffer
				255	self._buffer = None
				256	if return_data:
				257	blocks.append(data)
				258	self._pos += len(data)
				259	n -= len(data)
				260	if return_data:
				261	return b"".join(blocks)
				262
				263	def peek(self, size=-1):
				264	"""Return buffered data without advancing the file position.
				265
				266	Always returns at least one byte of data, unless at EOF.
				267	The exact number of bytes returned is unspecified.
				268	"""
				269	self._check_can_read()
				270	if self._mode == _MODE_READ_EOF or not self._fill_buffer():
				271	return b""
				272	return self._buffer
				273
				274	def read(self, size=-1):
				275	"""Read up to size uncompressed bytes from the file.
				276
				277	If size is negative or omitted, read until EOF is reached.
				278	Returns b"" if the file is already at EOF.
				279	"""
				280	self._check_can_read()
				281	if self._mode == _MODE_READ_EOF or size == 0:
				282	return b""
				283	elif size < 0:
				284	return self._read_all()
				285	else:
				286	return self._read_block(size)
				287
				288	def read1(self, size=-1):
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	289	"""Read up to size uncompressed bytes, while trying to avoid
				290	making multiple reads from the underlying stream.
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	291
				292	Returns b"" if the file is at EOF.
				293	"""
Nadeem Vawda	37d3ff1	2012-08-05 02:19:09 +0200	[diff] [blame]	294	# Usually, read1() calls _fp.read() at most once. However, sometimes
				295	# this does not give enough data for the decompressor to make progress.
				296	# In this case we make multiple reads, to avoid returning b"".
Nadeem Vawda	3ff069e	2011-11-30 00:25:06 +0200	[diff] [blame]	297	self._check_can_read()
				298	if (size == 0 or self._mode == _MODE_READ_EOF or
				299	not self._fill_buffer()):
				300	return b""
				301	if 0 < size < len(self._buffer):
				302	data = self._buffer[:size]
				303	self._buffer = self._buffer[size:]
				304	else:
				305	data = self._buffer
				306	self._buffer = None
				307	self._pos += len(data)
				308	return data
				309
				310	def write(self, data):
				311	"""Write a bytes object to the file.
				312
				313	Returns the number of uncompressed bytes written, which is
				314	always len(data). Note that due to buffering, the file on disk
				315	may not reflect the data written until close() is called.
				316	"""
				317	self._check_can_write()
				318	compressed = self._compressor.compress(data)
				319	self._fp.write(compressed)
				320	self._pos += len(data)
				321	return len(data)
				322
				323	# Rewind the file to the beginning of the data stream.
				324	def _rewind(self):
				325	self._fp.seek(0, 0)
				326	self._mode = _MODE_READ
				327	self._pos = 0
				328	self._decompressor = LZMADecompressor(**self._init_args)
				329	self._buffer = None
				330
				331	def seek(self, offset, whence=0):
				332	"""Change the file position.
				333
				334	The new position is specified by offset, relative to the
				335	position indicated by whence. Possible values for whence are:
				336
				337	0: start of stream (default): offset must not be negative
				338	1: current stream position
				339	2: end of stream; offset must not be positive
				340
				341	Returns the new file position.
				342
				343	Note that seeking is emulated, sp depending on the parameters,
				344	this operation may be extremely slow.
				345	"""
				346	self._check_can_seek()
				347
				348	# Recalculate offset as an absolute file position.
				349	if whence == 0:
				350	pass
				351	elif whence == 1:
				352	offset = self._pos + offset
				353	elif whence == 2:
				354	# Seeking relative to EOF - we need to know the file's size.
				355	if self._size < 0:
				356	self._read_all(return_data=False)
				357	offset = self._size + offset
				358	else:
				359	raise ValueError("Invalid value for whence: {}".format(whence))
				360
				361	# Make it so that offset is the number of bytes to skip forward.
				362	if offset < self._pos:
				363	self._rewind()
				364	else:
				365	offset -= self._pos
				366
				367	# Read and discard data until we reach the desired position.
				368	if self._mode != _MODE_READ_EOF:
				369	self._read_block(offset, return_data=False)
				370
				371	return self._pos
				372
				373	def tell(self):
				374	"""Return the current file position."""
				375	self._check_not_closed()
				376	return self._pos
				377
				378
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename is either a file name (str or bytes), in which case that
    file is opened, or an existing file object to read from or write to.

    mode is "r", "rb" (default), "w", "wb", "a" or "ab" for binary mode,
    or "rt", "wt" or "at" for text mode.

    format, check, preset and filters carry the compression settings and
    are forwarded unchanged to the LZMAFile constructor.

    In binary mode the result is the LZMAFile itself, and encoding,
    errors and newline must be left as None (ValueError otherwise).

    In text mode the LZMAFile is wrapped in an io.TextIOWrapper created
    with the given encoding, errors and newline. Combining "t" and "b"
    in mode raises ValueError.
    """
    text_mode = "t" in mode

    # Validate the mode/argument combination before touching any file.
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    elif encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    elif errors is not None:
        raise ValueError("Argument 'errors' not supported in binary mode")
    elif newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile does not accept a "t" flag, so strip it from the mode.
    raw_file = LZMAFile(filename, mode.replace("t", ""),
                        format=format, check=check,
                        preset=preset, filters=filters)

    if not text_mode:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
				422
				423
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress data in one shot and return the compressed bytes.

    format, check, preset and filters are handed to LZMACompressor;
    see its docstring for their meaning.

    Use an LZMACompressor object directly for incremental compression.
    """
    encoder = LZMACompressor(format=format, check=check,
                             preset=preset, filters=filters)
    body = encoder.compress(data)
    # flush() produces whatever output the compressor is still holding.
    trailer = encoder.flush()
    return body + trailer
				434
				435
				436	def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
				437	"""Decompress a block of data.
				438
				439	Refer to LZMADecompressor's docstring for a description of the
				440	optional arguments format, check and filters.
				441
				442	For incremental decompression, use a LZMADecompressor object instead.
				443	"""
				444	results = []
				445	while True:
				446	decomp = LZMADecompressor(format, memlimit, filters)
				447	results.append(decomp.decompress(data))
				448	if not decomp.eof:
				449	raise LZMAError("Compressed data ended before the "
				450	"end-of-stream marker was reached")
				451	if not decomp.unused_data:
				452	return b"".join(results)
				453	# There is unused data left over. Proceed to next stream.
				454	data = decomp.unused_data