blob: 3786993ccff0f80cc89043b112b66126c6ff6ac2 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Names exported by "from lzma import *".  The upper-case constants are
# not defined in this file; they come from the "from _lzma import *" below.
__all__ = [
    # CHECK_* constants
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    # FILTER_* constants
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86",
    "FILTER_IA64", "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC",
    "FILTER_SPARC",
    # FORMAT_* constants
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    # MF_* constants
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    # MODE_* and PRESET_* constants
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    # Classes and functions
    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "compress", "decompress", "check_is_supported",
]
23
24import io
25from _lzma import *
26
27
# Values stored in LZMAFile._mode to track what the file object may do.
_MODE_CLOSED, _MODE_READ, _MODE_READ_EOF, _MODE_WRITE = range(4)

# Number of bytes requested from the underlying file per read.
_BUFFER_SIZE = 8 * 1024
34
35
class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 fileobj=None, format=None, check=-1,
                 preset=None, filters=None):
        """Open an LZMA-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be "r" for reading (default), "w" for (over)writing, or
        "a" for appending.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError if mode is invalid, if check or preset is
        given when reading, or if not exactly one of filename and
        fileobj is given.
        """
        # Start out in a consistent "closed" state so that close() is
        # safe to call even if the rest of the initializer raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0      # Current position in the uncompressed stream.
        self._size = -1    # Uncompressed size; -1 until EOF has been seen.

        if mode == "r":
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format": format, "filters": filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            self._buffer = None
        elif mode in ("w", "a"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            # The underlying file is always opened in binary mode.
            mode += "b"
            self._fp = open(filename, mode)
            # We opened the file, so we are responsible for closing it.
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Write out whatever the compressor is still holding.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            try:
                # Only close the underlying file if we opened it ourselves.
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        # Raise io.UnsupportedOperation unless the file is open for reading.
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        # Raise io.UnsupportedOperation unless the file is open for writing.
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        # Seeking needs a readable LZMAFile on top of a seekable file.
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        # Loop until the decompressor actually yields some output: a
        # single chunk of input may produce no data at all, and callers
        # such as peek() and read1() rely on a non-empty buffer whenever
        # this method returns True.
        while not self._buffer:
            # Leftover input from a finished stream takes priority over
            # reading more from the underlying file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    # Now that EOF has been reached, the size is known.
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            # Continue to next stream.
            if self._decompressor.eof:
                self._decompressor = LZMADecompressor(**self._init_args)

            self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only what is needed; keep the rest buffered.
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified; the size
        argument is accepted but not used.
        """
        self._check_can_read()
        if self._mode == _MODE_READ_EOF or not self._fill_buffer():
            return b""
        return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative, None or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if size is None:
            # Follow the io convention that None means "no limit".
            size = -1
        if self._mode == _MODE_READ_EOF or size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes from the readahead buffer,
        refilling the buffer first if it is empty.

        If size is negative, None or omitted, return all buffered data.
        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size is None:
            # Follow the io convention that None means "no limit".
            size = -1
        if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
            return b""
        if 0 < size < len(self._buffer):
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        else:
            data = self._buffer
            self._buffer = None
        self._pos += len(data)
        return data

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        self._check_can_write()
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += len(data)
        return len(data)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        # Decompression has to restart from scratch with a new object.
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        if self._mode != _MODE_READ_EOF:
            self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
369
370
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress *data* in one shot and return the compressed bytes.

    The optional arguments *format*, *check*, *preset* and *filters*
    are passed straight to LZMACompressor; see its docstring for what
    they mean.

    Use an LZMACompressor object directly for incremental compression.
    """
    encoder = LZMACompressor(format, check, preset, filters)
    # The output is whatever compress() emits followed by the final flush.
    pieces = [encoder.compress(data), encoder.flush()]
    return b"".join(pieces)
381
382
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress *data* in one shot and return the uncompressed bytes.

    The optional arguments *format*, *memlimit* and *filters* are
    passed straight to LZMADecompressor; see its docstring for what
    they mean.

    If *data* holds several compressed streams back to back, they are
    all decompressed and the results concatenated. LZMAError is raised
    if the input ends before a stream's end-of-stream marker.

    Use an LZMADecompressor object directly for incremental decompression.
    """
    pieces = []
    remaining = data
    while True:
        # Each stream needs its own decompressor object.
        decoder = LZMADecompressor(format, memlimit, filters)
        pieces.append(decoder.decompress(remaining))
        if not decoder.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Anything left over after this stream is the start of the next.
        remaining = decoder.unused_data
        if not remaining:
            break
    return b"".join(pieces)