blob: 5adf51f8de2ca30c6a4c93210cd836ce06f479fa [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Names exported by a star-import of this module.  Only LZMAFile, compress
# and decompress are defined in this file; the remaining names are presumably
# supplied by the "from _lzma import *" statement (verify against _lzma).
__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "compress", "decompress", "check_is_supported",
    "encode_filter_properties", "decode_filter_properties",
]
24
25import io
26from _lzma import *
27
28
# Internal state codes stored in LZMAFile._mode.
_MODE_CLOSED = 0    # File is closed (also the state before __init__ succeeds).
_MODE_READ = 1      # Open for reading; more data may still be available.
_MODE_READ_EOF = 2  # Open for reading; the end of the data has been reached.
_MODE_WRITE = 3     # Open for writing ("w" or "a").

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8192
35
36
class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 fileobj=None, format=None, check=-1,
                 preset=None, filters=None):
        """Open an LZMA-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be "r" for reading (default), "w" for (over)writing, or
        "a" for appending.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an invalid mode, for check/preset given
        together with mode "r", or unless exactly one of filename and
        fileobj is supplied.
        """
        # Start out in the closed state so that close() is a harmless
        # no-op if construction fails part-way through.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0      # Current position in the uncompressed stream.
        self._size = -1    # Uncompressed size; -1 until EOF has been seen.

        if mode == "r":
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format": format, "filters": filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            self._buffer = None
        elif mode in ("w", "a"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            # The underlying file is always opened in binary mode.
            mode += "b"
            self._fp = open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Reset our state even if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        # Raise io.UnsupportedOperation unless the file is open for reading.
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        # Raise io.UnsupportedOperation unless the file is open for writing.
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        # Seeking needs a readable LZMAFile on top of a seekable file object.
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        # Depending on the input data, a call to the decompressor may not
        # return any data (e.g. it only consumed a header or part of a
        # block). Returning True with an empty buffer would make read1()
        # and peek() report a spurious EOF, so keep feeding the
        # decompressor until it yields data or the input is exhausted.
        while True:
            if self._buffer:
                return True

            # Data left over after the end of a stream belongs to the
            # next stream; use it up before reading from the file again.
            if self._decompressor.unused_data:
                rawblock = self._decompressor.unused_data
            else:
                rawblock = self._fp.read(_BUFFER_SIZE)

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    # The total uncompressed size is now known.
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            # Continue to next stream.
            if self._decompressor.eof:
                self._decompressor = LZMADecompressor(**self._init_args)

            self._buffer = self._decompressor.decompress(rawblock)

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only what was asked for; keep the rest buffered.
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified; the size
        argument is accepted but not used.
        """
        self._check_can_read()
        if self._mode == _MODE_READ_EOF or not self._fill_buffer():
            return b""
        return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if self._mode == _MODE_READ_EOF or size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        If size is negative or omitted, return everything that is
        currently buffered. Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
            return b""
        if 0 < size < len(self._buffer):
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        else:
            data = self._buffer
            self._buffer = None
        self._pos += len(data)
        return data

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        self._check_can_write()
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += len(data)
        return len(data)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        # Decompression has to start over with a fresh decompressor.
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            # Going backwards means decompressing again from the start.
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        if self._mode != _MODE_READ_EOF:
            self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
370
371
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress *data* in one shot and return the compressed bytes.

    The optional arguments *format*, *check*, *preset* and *filters*
    are handed to LZMACompressor unchanged; see its docstring for
    their meaning.

    Use an LZMACompressor object directly for incremental compression.
    """
    compressor = LZMACompressor(format, check, preset, filters)
    body = compressor.compress(data)
    # flush() emits whatever the compressor was still holding back.
    trailer = compressor.flush()
    return body + trailer
382
383
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    *data* may contain several concatenated compressed streams; the
    decompressed contents of all of them are joined and returned.
    Trailing bytes that do not form a valid stream are ignored once at
    least one complete stream has been decoded.

    Raises LZMAError if the first stream is invalid, or if the data
    ends before the end-of-stream marker of a stream is reached.

    For incremental decompression, use an LZMADecompressor object instead.
    """
    results = []
    while True:
        # Each stream needs its own decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Anything after the end of this stream is input for the next one.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)