blob: 780c666eebf9be9dc0793b9d99cf931f0320e627 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Names exported by a star-import of this module.  Everything except
# LZMAFile, compress and decompress is not defined in this file and is
# expected to come from the "from _lzma import *" statement below.
__all__ = [
    # Integrity checks (see the *check* argument of LZMAFile).
    "CHECK_NONE",
    "CHECK_CRC32",
    "CHECK_CRC64",
    "CHECK_SHA256",
    "CHECK_ID_MAX",
    "CHECK_UNKNOWN",
    # Filter IDs (the "id" entry of a filter-chain dict).
    "FILTER_LZMA1",
    "FILTER_LZMA2",
    "FILTER_DELTA",
    "FILTER_X86",
    "FILTER_IA64",
    "FILTER_ARM",
    "FILTER_ARMTHUMB",
    "FILTER_POWERPC",
    "FILTER_SPARC",
    # Container formats (see the *format* argument of LZMAFile).
    "FORMAT_AUTO",
    "FORMAT_XZ",
    "FORMAT_ALONE",
    "FORMAT_RAW",
    # MF_* and MODE_* constants.
    "MF_HC3",
    "MF_HC4",
    "MF_BT2",
    "MF_BT3",
    "MF_BT4",
    "MODE_FAST",
    "MODE_NORMAL",
    # Preset compression levels.
    "PRESET_DEFAULT",
    "PRESET_EXTREME",

    # Classes, exception and functions.
    "LZMACompressor",
    "LZMADecompressor",
    "LZMAFile",
    "LZMAError",
    "compress",
    "decompress",
    "check_is_supported",
]
23
24import io
25from _lzma import *
26
27
# State codes stored in LZMAFile._mode.  _MODE_READ_EOF is entered once a
# file opened for reading has been consumed up to its end.
(_MODE_CLOSED,
 _MODE_READ,
 _MODE_READ_EOF,
 _MODE_WRITE) = range(4)

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8 * 1024
34
35
class LZMAFile(io.BufferedIOBase):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 fileobj=None, format=None, check=-1,
                 preset=None, filters=None):
        """Open an LZMA-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be "r" for reading (default), "w" for (over)writing, or
        "a" for appending.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unknown mode, for *check* or *preset*
        given together with mode "r", or when not exactly one of
        filename and fileobj is supplied.
        """
        # Start in the closed state so that close() is a no-op if any of
        # the setup below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        # Position in the uncompressed data (bytes read or written so far).
        self._pos = 0
        # Total uncompressed size; stays -1 until EOF has been reached.
        self._size = -1

        if mode == "r":
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
            # Save the args to pass to the LZMADecompressor initializer.
            # If the file contains multiple compressed streams, each
            # stream will need a separate decompressor object.
            self._init_args = {"format": format, "filters": filters}
            self._decompressor = LZMADecompressor(**self._init_args)
            self._buffer = None
        elif mode in ("w", "a"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            # The file always holds binary data, whatever mode was given.
            mode += "b"
            self._fp = open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode in (_MODE_READ, _MODE_READ_EOF):
                self._decompressor = None
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Release the underlying file even if flushing failed, and
            # always end up in the closed state.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        """Raise ValueError if the file has been closed."""
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        """Raise io.UnsupportedOperation unless the file is readable."""
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        """Raise io.UnsupportedOperation unless the file is writable."""
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        """Raise io.UnsupportedOperation unless the file is seekable."""
        if not self.seekable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")

    def _mark_eof(self):
        """Record that the end of the uncompressed data has been reached."""
        self._mode = _MODE_READ_EOF
        self._size = self._pos

    def _fill_buffer(self):
        """Fill the readahead buffer if it is empty.

        Returns True once the buffer holds at least one byte, and False
        on EOF.  Raises EOFError if the underlying file ends in the
        middle of a compressed stream.
        """
        # A call to the decompressor may consume input without returning
        # any data.  Keep feeding it until there is something to hand out,
        # otherwise peek() and read1() would return b"" before EOF.
        while not self._buffer:
            if self._decompressor.unused_data:
                rawblock = self._decompressor.unused_data
            else:
                rawblock = self._fp.read(_BUFFER_SIZE)

            if not rawblock:
                if self._decompressor.eof:
                    self._mark_eof()
                    return False
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

            if self._decompressor.eof:
                # Continue to next stream.
                self._decompressor = LZMADecompressor(**self._init_args)
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except LZMAError:
                    # At least one complete stream has been read and what
                    # follows is not a valid compressed stream: treat it
                    # as trailing junk and stop here.
                    self._mark_eof()
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
        return True

    def _read_all(self, return_data=True):
        """Read data until EOF.

        If return_data is false, consume the data without returning it.
        """
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    def _read_block(self, n, return_data=True):
        """Read a block of up to n bytes.

        If return_data is false, consume the data without returning it.
        """
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only what was asked for; keep the rest buffered.
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified; the size
        argument is accepted but not used.
        """
        self._check_can_read()
        if self._mode == _MODE_READ_EOF or not self._fill_buffer():
            return b""
        return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative, None or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        if size is None:
            size = -1
        if self._mode == _MODE_READ_EOF or size == 0:
            return b""
        elif size < 0:
            return self._read_all()
        else:
            return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes from the readahead buffer.

        The underlying stream is read only as far as is needed to
        obtain some data.  If size is negative, None or omitted, the
        whole buffer is returned.

        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size is None:
            size = -1
        if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
            return b""
        if 0 < size < len(self._buffer):
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]
        else:
            data = self._buffer
            self._buffer = None
        self._pos += len(data)
        return data

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        self._check_can_write()
        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += len(data)
        return len(data)

    def _rewind(self):
        """Rewind the file to the beginning of the data stream."""
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        # Decompression has to restart from scratch with a new decompressor.
        self._decompressor = LZMADecompressor(**self._init_args)
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()

        # Recalculate offset as an absolute file position.
        if whence == 0:
            pass
        elif whence == 1:
            offset = self._pos + offset
        elif whence == 2:
            # Seeking relative to EOF - we need to know the file's size.
            if self._size < 0:
                self._read_all(return_data=False)
            offset = self._size + offset
        else:
            raise ValueError("Invalid value for whence: {}".format(whence))

        # Make it so that offset is the number of bytes to skip forward.
        if offset < self._pos:
            # Going backwards means starting over from the beginning.
            self._rewind()
        else:
            offset -= self._pos

        # Read and discard data until we reach the desired position.
        if self._mode != _MODE_READ_EOF:
            self._read_block(offset, return_data=False)

        return self._pos

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        return self._pos
366
367
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Return *data* compressed in a single one-shot operation.

    The optional arguments *format*, *check*, *preset* and *filters*
    are handed to LZMACompressor unchanged; refer to its docstring for
    their meaning.

    For incremental compression, use an LZMACompressor object instead.
    """
    compressor = LZMACompressor(format, check, preset, filters)
    body = compressor.compress(data)
    # flush() finishes the stream and returns whatever is still pending.
    trailer = compressor.flush()
    return body + trailer
378
379
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    *data* may consist of several compressed streams back to back; the
    decompressed contents of all of them are concatenated.  Data that
    follows the last complete stream and is not itself a valid
    compressed stream is ignored.

    Raises LZMAError if the first stream is invalid, or if *data* ends
    before the end-of-stream marker of a stream is reached.

    For incremental decompression, use an LZMADecompressor object instead.
    """
    results = []
    while True:
        # Each stream needs a separate decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            chunk = decomp.decompress(data)
        except LZMAError:
            if results:
                # Leftover data is not a valid stream; keep what we have.
                break
            # Error in the very first stream: nothing to salvage.
            raise
        results.append(chunk)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Anything left over is (potentially) the next stream.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)