blob: b2e2f7ed0eab2c54e4d8beac877595ead97c3f13 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Public names exported by "from lzma import *".
__all__ = [
    # Constants (integrity checks, filter IDs, container formats, match
    # finders, compression modes and presets).
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    # Classes and functions.
    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]
23
Nadeem Vawdae8604042012-06-04 23:38:12 +020024import builtins
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020025import io
26from _lzma import *
Nadeem Vawdaa425c3d2012-06-21 23:36:48 +020027from _lzma import _encode_filter_properties, _decode_filter_properties
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020028
29
# Values stored in LZMAFile._mode to track the state of the file object.
_MODE_CLOSED = 0
_MODE_READ = 1
# Open for reading, and the end of the compressed data has been reached.
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of bytes requested from the underlying file per read when
# refilling the decompression buffer.
_BUFFER_SIZE = 8192
36
37
class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str or
        bytes object), in which case the named file is opened, or it can
        be an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing, or
        "a" for appending. These can equivalently be given as "rb", "wb"
        and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an invalid mode or for arguments that are
        not allowed in read mode, and TypeError if filename is neither a
        str/bytes object nor a file-like object.
        """
        # Start out in a consistent "closed" state, so that close() is
        # safe to call even if the rest of the initializer raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format":format, "filters":filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb", "a", "ab"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            # The underlying file is always opened in binary mode.
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            # We opened the file, so we are responsible for closing it.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = b""
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and mark ourselves as
            # closed, even if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.
    #
    # The _check_can_*() helpers test the expected mode first and only
    # fall back to the closed check on failure, so the common (valid)
    # case costs a single comparison.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if self._mode != _MODE_WRITE:
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Data left over from a finished stream takes priority over
            # reading more from the underlying file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            if self._decompressor.eof:
                # Continue to next stream.
                self._decompressor = LZMADecompressor(**self._init_args)
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except LZMAError:
                    # At least one complete stream has been decoded, and
                    # what follows it is not a valid compressed stream.
                    # Treat it as trailing junk and report EOF instead of
                    # failing after all the real data was read.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
            self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only part of the buffer; the rest stays available
                # for the next read.
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        if not self._fill_buffer():
            return b""
        return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b"" if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        self._check_can_read()
        if (size == 0 or
            # Only call _fill_buffer() if the buffer is actually empty.
            # This gives a significant speedup if *size* is small.
            (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
            return b""
        if size > 0:
            data = self._buffer[self._buffer_offset :
                                self._buffer_offset + size]
            self._buffer_offset += len(data)
        else:
            # Negative size: hand over everything currently buffered.
            data = self._buffer[self._buffer_offset:]
            self._buffer = b""
            self._buffer_offset = 0
        self._pos += len(data)
        return data

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        # Shortcut for the common case - the whole line is in the buffer.
        if size < 0:
            # find() returns -1 when there is no newline, making end == 0.
            end = self._buffer.find(b"\n", self._buffer_offset) + 1
            if end > 0:
                line = self._buffer[self._buffer_offset : end]
                self._buffer_offset = end
                self._pos += len(line)
                return line
        return io.BufferedIOBase.readline(self, size)

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        self._check_can_write()
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += len(data)
        return len(data)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            # Going backwards means decompressing again from the start.
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
418
419
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename can be either an actual file name (given as a str or bytes
    object), in which case the named file is opened, or it can be an
    existing file object to read from or write to.

    The mode argument can be "r", "rb" (default), "w", "wb", "a" or "ab"
    for binary mode, or "rt", "wt" or "at" for text mode.

    The format, check, preset and filters arguments specify the
    compression settings, as for LZMACompressor, LZMADecompressor and
    LZMAFile.

    For binary mode, this function is equivalent to the LZMAFile
    constructor: LZMAFile(filename, mode, ...). In this case, the
    encoding, errors and newline arguments must not be provided.

    For text mode, a LZMAFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode combines "t" with "b", or if any of the
    text-only arguments is given in binary mode.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile only understands binary modes; it validates the rest.
    lz_mode = mode.replace("t", "")
    binary_file = LZMAFile(filename, lz_mode, format=format, check=check,
                           preset=preset, filters=filters)

    if not text_mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except Exception:
        # The wrapper could not be built (e.g. unknown encoding). Close the
        # LZMAFile now rather than leaving any file it opened to be
        # released whenever the garbage collector gets to it.
        binary_file.close()
        raise
464
465
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data in one shot and return the result as bytes.

    The optional arguments *format*, *check*, *preset* and *filters* are
    passed straight to LZMACompressor; see its docstring for details.

    To compress data incrementally, use an LZMACompressor directly.
    """
    compressor = LZMACompressor(format, check, preset, filters)
    body = compressor.compress(data)
    # flush() finishes the stream; without it the output is incomplete.
    trailer = compressor.flush()
    return body + trailer
476
477
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    The input may consist of several concatenated compressed streams;
    their contents are joined in order. Data following the last valid
    stream that is not itself a valid stream is ignored.

    Raises LZMAError if the data is not a valid compressed stream, or if
    it ends before the end-of-stream marker is reached.

    For incremental decompression, use an LZMADecompressor instead.
    """
    results = []
    while True:
        # Each stream needs its own decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                # Leftover data is not a valid LZMA/XZ stream; ignore it.
                break
            # Error on the very first stream: the input itself is bad.
            raise
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # There may be unused data left over. Proceed to next stream.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)