# blob: f1d395817973347b484ca78d237f2192757f4e8f [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Public API of this module.  Only LZMAFile, open, compress and decompress
# are defined in this file; the remaining names are not defined here and
# are expected to come from the ``from _lzma import *`` below.
__all__ = [
    # Integrity-check identifiers.
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    # Filter identifiers (the "id" entry of a filter-chain dict).
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    # Container formats.
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    # NOTE(review): MF_* / MODE_* look like filter tuning options (match
    # finders and compression modes) -- confirm against _lzma.
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]

import builtins
import io
from _lzma import *
# A star import skips underscore-prefixed names, so these two helpers are
# imported explicitly.  They are not used by the code in this file.
from _lzma import _encode_filter_properties, _decode_filter_properties


# States of an LZMAFile, stored in LZMAFile._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Reading, and the end of the last compressed stream has been reached.
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8192


class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str or
        bytes object), in which case the named file is opened, or it can
        be an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unrecognised mode, or if check/preset
        are given when reading; raises TypeError if filename is neither
        a str/bytes object nor an object with a read or write attribute.
        """
        # Set these first so that close() is safe to call even if the
        # rest of the initializer raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0       # Current position in the uncompressed data.
        self._size = -1     # Uncompressed size; -1 until EOF has been seen.

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format": format, "filters": filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            # Readahead buffer of decompressed data; bytes before
            # _buffer_offset have already been handed to the caller.
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            # The underlying file is always opened in binary mode.
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: used as-is and never closed
            # by us (_closefp stays False).
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = b""
            elif self._mode == _MODE_WRITE:
                # Write out whatever the compressor is still holding.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and mark ourselves
            # closed, even if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.
    # Each raises ValueError if the file is closed, and
    # io.UnsupportedOperation if it is open in the wrong mode.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if self._mode != _MODE_WRITE:
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Input left over after the end of the previous stream is
            # consumed before anything more is read from the file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    # Clean end of file: the uncompressed size is now known.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            if self._decompressor.eof:
                # Continue to next stream.
                self._decompressor = LZMADecompressor(**self._init_args)
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except LZMAError:
                    # Trailing data isn't a valid compressed stream; ignore it.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
            self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # Nothing to do for a non-positive count.  Without this guard a
        # negative n (from seek() to a position before the start) would
        # store a negative _buffer_offset and corrupt later reads.
        if n <= 0:
            return b"" if return_data else None

        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only part of the buffer; the rest stays available.
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified; the size
        argument is accepted but ignored.
        """
        self._check_can_read()
        if not self._fill_buffer():
            return b""
        return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative, None or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if size is None:
            # io.BufferedIOBase.read() treats None like a negative size.
            size = -1
        if size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b"" if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        self._check_can_read()
        if (size == 0 or
            # Only call _fill_buffer() if the buffer is actually empty.
            # This gives a significant speedup if *size* is small.
            (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
            return b""
        if size > 0:
            data = self._buffer[self._buffer_offset :
                                self._buffer_offset + size]
            self._buffer_offset += len(data)
        else:
            # Negative size: hand over everything that is buffered.
            data = self._buffer[self._buffer_offset:]
            self._buffer = b""
            self._buffer_offset = 0
        self._pos += len(data)
        return data

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        # Shortcut for the common case - the whole line is in the buffer.
        if size < 0:
            # find() returns -1 when there is no newline, making end == 0.
            end = self._buffer.find(b"\n", self._buffer_offset) + 1
            if end > 0:
                line = self._buffer[self._buffer_offset : end]
                self._buffer_offset = end
                self._pos += len(line)
                return line
        return io.BufferedIOBase.readline(self, size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written. Note that due
        to buffering, the file on disk may not reflect the data written
        until close() is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # Other buffer objects (array, multi-byte memoryview, ...):
            # len() would count items, not bytes.
            data = memoryview(data)
            length = data.nbytes
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position. A target before the start of the
        stream leaves the position at 0; a target past the end leaves it
        at the end.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            # Going backwards means decompressing again from the start.
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
425
426
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename can be either an actual file name (given as a str or bytes
    object), in which case the named file is opened, or it can be an
    existing file object to read from or write to.

    The mode argument can be "r", "rb" (default), "w", "wb", "x", "xb",
    "a", or "ab" for binary mode, or "rt", "wt", "xt", or "at" for text
    mode.

    The format, check, preset and filters arguments specify the
    compression settings, as for LZMACompressor, LZMADecompressor and
    LZMAFile.

    For binary mode, this function is equivalent to the LZMAFile
    constructor: LZMAFile(filename, mode, ...). In this case, the
    encoding, errors and newline arguments must not be provided.

    For text mode, a LZMAFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile itself only understands the binary modes.
    lz_mode = mode.replace("t", "")
    binary_file = LZMAFile(filename, lz_mode, format=format, check=check,
                           preset=preset, filters=filters)

    if "t" in mode:
        try:
            return io.TextIOWrapper(binary_file, encoding, errors, newline)
        except BaseException:
            # Wrapping failed (e.g. unknown encoding): don't leave the
            # just-opened LZMAFile dangling until garbage collection.
            binary_file.close()
            raise
    else:
        return binary_file
472
473
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data.

    Refer to LZMACompressor's docstring for a description of the
    optional arguments *format*, *check*, *preset* and *filters*.

    Returns the complete compressed stream as a bytes object.

    For incremental compression, use an LZMACompressor instead.
    """
    comp = LZMACompressor(format, check, preset, filters)
    # flush() emits whatever the compressor buffered plus the stream trailer.
    return comp.compress(data) + comp.flush()
484
485
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    If data holds several concatenated compressed streams, all of them
    are decompressed and the results joined. Data following the last
    complete stream that is not itself a valid stream is ignored.

    Raises LZMAError if the first stream is invalid, or if the data
    ends before a stream's end-of-stream marker.

    For incremental decompression, use an LZMADecompressor instead.
    """
    results = []
    while True:
        # Each stream needs its own decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Whatever follows the finished stream is tried as a further stream.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)