blob: 07906910c5641b11621958859306add92ce5fd23 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Names exported by a star-import of this module.
__all__ = [
    # CHECK_*, FILTER_*, FORMAT_*, MF_*, MODE_* and PRESET_* constants.
    # None of these are defined in this file; presumably they are supplied
    # by the `from _lzma import *` below -- verify against _lzma.
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    # Incremental (de)compressor classes, the file wrapper class defined in
    # this module, and the exception type raised on bad compressed data.
    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    # compress() and decompress() are the one-shot helpers defined in this
    # module; the remaining names are not defined here (presumably _lzma).
    "compress", "decompress", "is_check_supported",
    "encode_filter_properties", "decode_filter_properties",
]
24
25import io
26from _lzma import *
27
28
# Values taken by LZMAFile._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2  # Open for reading, and all data has been consumed.
_MODE_WRITE = 3

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8192


class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str or
        bytes object), in which case the named file is opened, or it can
        be an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing, or
        "a" for appending. These can equivalently be given as "rb", "wb",
        and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unsupported mode, or if check/preset
        are given when reading, and TypeError if filename is neither a
        str/bytes object nor an object with a read() or write() method.
        """
        # Start out in the closed state so that close() is a harmless
        # no-op if anything below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0    # Current position in the uncompressed data.
        self._size = -1  # Uncompressed size; -1 until EOF has been seen.

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format": format, "filters": filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            self._buffer = None
        elif mode in ("w", "wb", "a", "ab"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            # The underlying file must always be opened in binary mode.
            if "b" not in mode:
                mode += "b"
            self._fp = open(filename, mode)
            self._closefp = True  # We opened it, so close() must close it.
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and mark ourselves closed,
            # even if the final flush or the close itself fails.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Mark the file as fully consumed and record its uncompressed size.
    def _set_eof(self):
        self._mode = _MODE_READ_EOF
        self._size = self._pos

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    # On a True return, self._buffer is guaranteed to be non-empty.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False

        # Depending on the input, a single call to the decompressor may
        # not produce any output (e.g. when the block read holds only
        # header or footer bytes). Keep feeding it until we get some data
        # or run out of input, so that callers such as read1() and peek()
        # never mistake an empty buffer for end-of-file.
        while not self._buffer:
            # Data left over after the end of the previous stream takes
            # priority over reading more from the underlying file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    # End-of-stream marker and end of file: clean EOF.
                    self._set_eof()
                    return False
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

            if self._decompressor.eof:
                # Continue to next stream.
                self._decompressor = LZMADecompressor(**self._init_args)
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except LZMAError:
                    # What follows the last complete stream is not a
                    # valid compressed stream; ignore it and stop here.
                    self._set_eof()
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only what was asked for; keep the rest buffered.
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified; the size
        argument is accepted but ignored.
        """
        self._check_can_read()
        if self._mode == _MODE_READ_EOF or not self._fill_buffer():
            return b""
        return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative, None or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if size is None:
            # io.BufferedIOBase.read() documents None as "read everything".
            size = -1
        if self._mode == _MODE_READ_EOF or size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        If size is negative, None or omitted, return whatever is
        currently buffered. Returns b"" only if the file is at EOF.
        """
        self._check_can_read()
        if size is None:
            size = -1
        if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
            return b""
        if 0 < size < len(self._buffer):
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        else:
            data = self._buffer
            self._buffer = None
        self._pos += len(data)
        return data

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        self._check_can_write()
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += len(data)
        return len(data)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.

        Raises ValueError if whence is not 0, 1 or 2.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            # Cannot decompress backwards: restart from the beginning.
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        if self._mode != _MODE_READ_EOF:
            self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
371
372
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress *data* in one shot and return the compressed bytes.

    The optional arguments *format*, *check*, *preset* and *filters*
    are handed to LZMACompressor unchanged; see its docstring for their
    meaning.

    Use an LZMACompressor object directly for incremental compression.
    """
    encoder = LZMACompressor(format=format, check=check,
                             preset=preset, filters=filters)
    body = encoder.compress(data)
    # flush() finishes the stream and returns whatever is still buffered.
    trailer = encoder.flush()
    return body + trailer
383
384
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    If *data* is the concatenation of several complete compressed
    streams, the decompressed contents of all of them are joined and
    returned. Bytes following the last complete stream that do not form
    a valid compressed stream are ignored.

    Raises LZMAError if the first stream is invalid, or if the data ends
    before an end-of-stream marker is reached.

    For incremental decompression, use an LZMADecompressor object instead.
    """
    results = []
    while True:
        # Each stream needs its own decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                # Leftover data is not a valid compressed stream; keep
                # what was already decoded and ignore the rest.
                break
            # Error on the very first stream: nothing usable, bail out.
            raise
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Whatever follows the end-of-stream marker may be another stream.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)