blob: 1a89887b39e1a8e936c844efa3c4322de9b7670c [file] [log] [blame]
Nadeem Vawda3ff069e2011-11-30 00:25:06 +02001"""Interface to the liblzma compression library.
2
3This module provides a class for reading and writing compressed files,
4classes for incremental (de)compression, and convenience functions for
5one-shot (de)compression.
6
7These classes and functions support both the XZ and legacy LZMA
8container formats, as well as raw compressed data streams.
9"""
10
# Public names exported by a star-import of this module.  The CHECK_*,
# FILTER_*, FORMAT_*, MF_*, MODE_* and PRESET_* constants, the compressor /
# decompressor classes, LZMAError and is_check_supported are not defined in
# this file; they are expected to be provided by "from _lzma import *".
__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]
23
Nadeem Vawdae8604042012-06-04 23:38:12 +020024import builtins
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020025import io
26from _lzma import *
Nadeem Vawdaa425c3d2012-06-21 23:36:48 +020027from _lzma import _encode_filter_properties, _decode_filter_properties
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020028
29
30_MODE_CLOSED = 0
31_MODE_READ = 1
32_MODE_READ_EOF = 2
33_MODE_WRITE = 3
34
35_BUFFER_SIZE = 8192
36
37
38class LZMAFile(io.BufferedIOBase):
39
40 """A file object providing transparent LZMA (de)compression.
41
42 An LZMAFile can act as a wrapper for an existing file object, or
43 refer directly to a named file on disk.
44
45 Note that LZMAFile provides a *binary* file interface - data read
46 is returned as bytes, and data to be written must be given as bytes.
47 """
48
49 def __init__(self, filename=None, mode="r", *,
Nadeem Vawda33c34da2012-06-04 23:34:07 +020050 format=None, check=-1, preset=None, filters=None):
51 """Open an LZMA-compressed file in binary mode.
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020052
Nadeem Vawda33c34da2012-06-04 23:34:07 +020053 filename can be either an actual file name (given as a str or
54 bytes object), in which case the named file is opened, or it can
55 be an existing file object to read from or write to.
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020056
57 mode can be "r" for reading (default), "w" for (over)writing, or
Nadeem Vawda6cbb20c2012-06-04 23:36:24 +020058 "a" for appending. These can equivalently be given as "rb", "wb",
59 and "ab" respectively.
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020060
61 format specifies the container format to use for the file.
62 If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
63 default is FORMAT_XZ.
64
65 check specifies the integrity check to use. This argument can
66 only be used when opening a file for writing. For FORMAT_XZ,
67 the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
68 support integrity checks - for these formats, check must be
69 omitted, or be CHECK_NONE.
70
71 When opening a file for reading, the *preset* argument is not
72 meaningful, and should be omitted. The *filters* argument should
73 also be omitted, except when format is FORMAT_RAW (in which case
74 it is required).
75
76 When opening a file for writing, the settings used by the
77 compressor can be specified either as a preset compression
78 level (with the *preset* argument), or in detail as a custom
79 filter chain (with the *filters* argument). For FORMAT_XZ and
80 FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
81 level. For FORMAT_RAW, the caller must always specify a filter
82 chain; the raw compressor does not support preset compression
83 levels.
84
85 preset (if provided) should be an integer in the range 0-9,
86 optionally OR-ed with the constant PRESET_EXTREME.
87
88 filters (if provided) should be a sequence of dicts. Each dict
89 should have an entry for "id" indicating ID of the filter, plus
90 additional entries for options to the filter.
91 """
92 self._fp = None
93 self._closefp = False
94 self._mode = _MODE_CLOSED
95 self._pos = 0
96 self._size = -1
97
Nadeem Vawda6cbb20c2012-06-04 23:36:24 +020098 if mode in ("r", "rb"):
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020099 if check != -1:
100 raise ValueError("Cannot specify an integrity check "
101 "when opening a file for reading")
102 if preset is not None:
103 raise ValueError("Cannot specify a preset compression "
104 "level when opening a file for reading")
105 if format is None:
106 format = FORMAT_AUTO
107 mode_code = _MODE_READ
108 # Save the args to pass to the LZMADecompressor initializer.
109 # If the file contains multiple compressed streams, each
110 # stream will need a separate decompressor object.
111 self._init_args = {"format":format, "filters":filters}
112 self._decompressor = LZMADecompressor(**self._init_args)
113 self._buffer = None
Nadeem Vawda6cbb20c2012-06-04 23:36:24 +0200114 elif mode in ("w", "wb", "a", "ab"):
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200115 if format is None:
116 format = FORMAT_XZ
117 mode_code = _MODE_WRITE
118 self._compressor = LZMACompressor(format=format, check=check,
119 preset=preset, filters=filters)
120 else:
121 raise ValueError("Invalid mode: {!r}".format(mode))
122
Nadeem Vawda33c34da2012-06-04 23:34:07 +0200123 if isinstance(filename, (str, bytes)):
Nadeem Vawda6cbb20c2012-06-04 23:36:24 +0200124 if "b" not in mode:
125 mode += "b"
Nadeem Vawdae8604042012-06-04 23:38:12 +0200126 self._fp = builtins.open(filename, mode)
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200127 self._closefp = True
128 self._mode = mode_code
Nadeem Vawda33c34da2012-06-04 23:34:07 +0200129 elif hasattr(filename, "read") or hasattr(filename, "write"):
130 self._fp = filename
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200131 self._mode = mode_code
132 else:
Nadeem Vawda33c34da2012-06-04 23:34:07 +0200133 raise TypeError("filename must be a str or bytes object, or a file")
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200134
135 def close(self):
136 """Flush and close the file.
137
138 May be called more than once without error. Once the file is
139 closed, any other operation on it will raise a ValueError.
140 """
141 if self._mode == _MODE_CLOSED:
142 return
143 try:
144 if self._mode in (_MODE_READ, _MODE_READ_EOF):
145 self._decompressor = None
146 self._buffer = None
147 elif self._mode == _MODE_WRITE:
148 self._fp.write(self._compressor.flush())
149 self._compressor = None
150 finally:
151 try:
152 if self._closefp:
153 self._fp.close()
154 finally:
155 self._fp = None
156 self._closefp = False
157 self._mode = _MODE_CLOSED
158
159 @property
160 def closed(self):
161 """True if this file is closed."""
162 return self._mode == _MODE_CLOSED
163
164 def fileno(self):
165 """Return the file descriptor for the underlying file."""
166 self._check_not_closed()
167 return self._fp.fileno()
168
169 def seekable(self):
170 """Return whether the file supports seeking."""
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200171 return self.readable() and self._fp.seekable()
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200172
173 def readable(self):
174 """Return whether the file was opened for reading."""
175 self._check_not_closed()
176 return self._mode in (_MODE_READ, _MODE_READ_EOF)
177
178 def writable(self):
179 """Return whether the file was opened for writing."""
180 self._check_not_closed()
181 return self._mode == _MODE_WRITE
182
183 # Mode-checking helper functions.
184
185 def _check_not_closed(self):
186 if self.closed:
187 raise ValueError("I/O operation on closed file")
188
189 def _check_can_read(self):
190 if not self.readable():
191 raise io.UnsupportedOperation("File not open for reading")
192
193 def _check_can_write(self):
194 if not self.writable():
195 raise io.UnsupportedOperation("File not open for writing")
196
197 def _check_can_seek(self):
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200198 if not self.readable():
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200199 raise io.UnsupportedOperation("Seeking is only supported "
200 "on files open for reading")
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200201 if not self._fp.seekable():
202 raise io.UnsupportedOperation("The underlying file object "
203 "does not support seeking")
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200204
205 # Fill the readahead buffer if it is empty. Returns False on EOF.
206 def _fill_buffer(self):
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200207 # Depending on the input data, our call to the decompressor may not
208 # return any data. In this case, try again after reading another block.
209 while True:
210 if self._buffer:
211 return True
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200212
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200213 if self._decompressor.unused_data:
214 rawblock = self._decompressor.unused_data
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200215 else:
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200216 rawblock = self._fp.read(_BUFFER_SIZE)
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200217
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200218 if not rawblock:
219 if self._decompressor.eof:
220 self._mode = _MODE_READ_EOF
221 self._size = self._pos
222 return False
223 else:
224 raise EOFError("Compressed file ended before the "
225 "end-of-stream marker was reached")
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200226
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200227 if self._decompressor.eof:
Nadeem Vawda9c72ebc2013-12-04 23:03:49 +0100228 # Continue to next stream.
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200229 self._decompressor = LZMADecompressor(**self._init_args)
Nadeem Vawda9c72ebc2013-12-04 23:03:49 +0100230 try:
231 self._buffer = self._decompressor.decompress(rawblock)
232 except LZMAError:
233 # Trailing data isn't a valid compressed stream; ignore it.
234 self._mode = _MODE_READ_EOF
235 self._size = self._pos
236 return False
237 else:
238 self._buffer = self._decompressor.decompress(rawblock)
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200239
240 # Read data until EOF.
241 # If return_data is false, consume the data without returning it.
242 def _read_all(self, return_data=True):
243 blocks = []
244 while self._fill_buffer():
245 if return_data:
246 blocks.append(self._buffer)
247 self._pos += len(self._buffer)
248 self._buffer = None
249 if return_data:
250 return b"".join(blocks)
251
252 # Read a block of up to n bytes.
253 # If return_data is false, consume the data without returning it.
254 def _read_block(self, n, return_data=True):
255 blocks = []
256 while n > 0 and self._fill_buffer():
257 if n < len(self._buffer):
258 data = self._buffer[:n]
259 self._buffer = self._buffer[n:]
260 else:
261 data = self._buffer
262 self._buffer = None
263 if return_data:
264 blocks.append(data)
265 self._pos += len(data)
266 n -= len(data)
267 if return_data:
268 return b"".join(blocks)
269
270 def peek(self, size=-1):
271 """Return buffered data without advancing the file position.
272
273 Always returns at least one byte of data, unless at EOF.
274 The exact number of bytes returned is unspecified.
275 """
276 self._check_can_read()
277 if self._mode == _MODE_READ_EOF or not self._fill_buffer():
278 return b""
279 return self._buffer
280
281 def read(self, size=-1):
282 """Read up to size uncompressed bytes from the file.
283
284 If size is negative or omitted, read until EOF is reached.
285 Returns b"" if the file is already at EOF.
286 """
287 self._check_can_read()
288 if self._mode == _MODE_READ_EOF or size == 0:
289 return b""
290 elif size < 0:
291 return self._read_all()
292 else:
293 return self._read_block(size)
294
295 def read1(self, size=-1):
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200296 """Read up to size uncompressed bytes, while trying to avoid
297 making multiple reads from the underlying stream.
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200298
299 Returns b"" if the file is at EOF.
300 """
Nadeem Vawda37d3ff12012-08-05 02:19:09 +0200301 # Usually, read1() calls _fp.read() at most once. However, sometimes
302 # this does not give enough data for the decompressor to make progress.
303 # In this case we make multiple reads, to avoid returning b"".
Nadeem Vawda3ff069e2011-11-30 00:25:06 +0200304 self._check_can_read()
305 if (size == 0 or self._mode == _MODE_READ_EOF or
306 not self._fill_buffer()):
307 return b""
308 if 0 < size < len(self._buffer):
309 data = self._buffer[:size]
310 self._buffer = self._buffer[size:]
311 else:
312 data = self._buffer
313 self._buffer = None
314 self._pos += len(data)
315 return data
316
317 def write(self, data):
318 """Write a bytes object to the file.
319
320 Returns the number of uncompressed bytes written, which is
321 always len(data). Note that due to buffering, the file on disk
322 may not reflect the data written until close() is called.
323 """
324 self._check_can_write()
325 compressed = self._compressor.compress(data)
326 self._fp.write(compressed)
327 self._pos += len(data)
328 return len(data)
329
330 # Rewind the file to the beginning of the data stream.
331 def _rewind(self):
332 self._fp.seek(0, 0)
333 self._mode = _MODE_READ
334 self._pos = 0
335 self._decompressor = LZMADecompressor(**self._init_args)
336 self._buffer = None
337
338 def seek(self, offset, whence=0):
339 """Change the file position.
340
341 The new position is specified by offset, relative to the
342 position indicated by whence. Possible values for whence are:
343
344 0: start of stream (default): offset must not be negative
345 1: current stream position
346 2: end of stream; offset must not be positive
347
348 Returns the new file position.
349
350 Note that seeking is emulated, sp depending on the parameters,
351 this operation may be extremely slow.
352 """
353 self._check_can_seek()
354
355 # Recalculate offset as an absolute file position.
356 if whence == 0:
357 pass
358 elif whence == 1:
359 offset = self._pos + offset
360 elif whence == 2:
361 # Seeking relative to EOF - we need to know the file's size.
362 if self._size < 0:
363 self._read_all(return_data=False)
364 offset = self._size + offset
365 else:
366 raise ValueError("Invalid value for whence: {}".format(whence))
367
368 # Make it so that offset is the number of bytes to skip forward.
369 if offset < self._pos:
370 self._rewind()
371 else:
372 offset -= self._pos
373
374 # Read and discard data until we reach the desired position.
375 if self._mode != _MODE_READ_EOF:
376 self._read_block(offset, return_data=False)
377
378 return self._pos
379
380 def tell(self):
381 """Return the current file position."""
382 self._check_not_closed()
383 return self._pos
384
385
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename is either the name of a file (str or bytes), which is then
    opened, or an already-open file object to read from or write to.

    mode is one of "r", "rb" (the default), "w", "wb", "a" or "ab" for
    binary access, or "rt", "wt" or "at" for text access.

    format, check, preset and filters select the compression settings
    and are forwarded unchanged to the LZMAFile constructor.

    In binary mode the result is the LZMAFile itself, and encoding,
    errors and newline must all be left as None (ValueError otherwise).

    In text mode the LZMAFile is wrapped in an io.TextIOWrapper built
    with the given encoding, error handling and line-ending settings.
    Combining "t" and "b" in mode raises ValueError.

    """
    wants_text = "t" in mode

    if wants_text:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer options make no sense without a text layer.
        text_only_args = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for value, message in text_only_args:
            if value is not None:
                raise ValueError(message)

    # LZMAFile only understands binary modes, so drop the "t" flag.
    raw_file = LZMAFile(filename, mode.replace("t", ""),
                        format=format, check=check,
                        preset=preset, filters=filters)

    if not wants_text:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
429
430
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data in one shot and return the result.

    The optional arguments *format*, *check*, *preset* and *filters*
    are passed positionally to LZMACompressor; see its docstring for
    their meaning.

    For incremental compression, use an LZMACompressor object instead.
    """
    encoder = LZMACompressor(format, check, preset, filters)
    # Order matters: the body must be produced before the stream is finished.
    body = encoder.compress(data)
    trailer = encoder.flush()
    return b"".join((body, trailer))
441
442
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data in one shot and return the result.

    The optional arguments *format*, *memlimit* and *filters* are
    passed positionally to LZMADecompressor; see its docstring for
    their meaning.

    If data holds several concatenated streams, their contents are
    joined. Bytes after the last complete stream that do not form a
    valid stream are ignored. LZMAError is raised if the first stream
    is invalid or if any stream ends before its end-of-stream marker.

    For incremental decompression, use a LZMADecompressor object instead.
    """
    pieces = []
    pending = data
    while True:
        # Every stream needs its own decompressor object.
        stream = LZMADecompressor(format, memlimit, filters)
        try:
            piece = stream.decompress(pending)
        except LZMAError:
            if not pieces:
                raise  # Error on the first iteration; bail out.
            # Leftover data is not a valid LZMA/XZ stream; ignore it.
            return b"".join(pieces)
        pieces.append(piece)
        if not stream.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        pending = stream.unused_data
        if not pending:
            return b"".join(pieces)