blob: 7dff1c319a6c251f27c302ce9ee8aa3440ffc714 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Public API exported by ``from <this module> import *``.
#
# Only LZMAFile, open, compress and decompress are defined in this file.
# The CHECK_*/FILTER_*/FORMAT_*/MF_*/MODE_*/PRESET_* constants and the
# remaining names are presumably supplied by the ``from _lzma import *``
# statement below -- confirm against the _lzma extension module.
__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]
23
Nadeem Vawdae8604042012-06-04 23:38:12 +020024import builtins
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020025import io
26from _lzma import *
Nadeem Vawdaa425c3d2012-06-21 23:36:48 +020027from _lzma import _encode_filter_properties, _decode_filter_properties
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020028import _compression
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020029
30
# Internal state codes stored in LZMAFile._mode.  They record whether the
# file object is closed, open for reading, or open for writing.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3
35
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020036
class LZMAFile(_compression.BaseStream):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as a
    bytes-like object.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str or
        bytes object), in which case the named file is opened, or it can
        be an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unrecognised mode, or when check or
        preset is given together with a read mode, and TypeError when
        filename is neither a str/bytes object nor an object with a
        read() or write() method.
        """
        # Start out in the "closed" state so that close() is a safe no-op
        # if anything below raises before the object is fully set up.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            # The compressor is built before the file is opened below, so
            # an error in the compression settings is raised first.
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
            # Number of uncompressed bytes written so far; reported by tell().
            self._pos = 0
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            # The underlying file always carries binary data.
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: used as-is and left open by close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

        if self._mode == _MODE_READ:
            # All read-side operations are delegated to a BufferedReader
            # layered over a decompressing raw reader from _compression.
            raw = _compression.DecompressReader(self._fp, LZMADecompressor,
                trailing_error=LZMAError, format=format, filters=filters)
            self._buffer = io.BufferedReader(raw)

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and reset our state, even
            # if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Only the read side has a buffer to seek in; readable() also
        # performs the "not closed" check.
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek() always
        # returns at least one byte (except at EOF)
        return self._buffer.peek(size)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        return self._buffer.readline(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # Accept any object supporting the buffer protocol.  len() would
            # count *items* (e.g. for array.array or a multi-byte-item
            # memoryview), so measure the payload in bytes instead.
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        # Write mode: position is the count of uncompressed bytes written.
        return self._pos
259
260
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename is either the name of a file (a str or bytes object), which
    is then opened, or an already-open file object to read from or write
    to.

    mode is one of "r", "rb" (the default), "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.

    format, check, preset and filters carry the compression settings and
    are passed straight through to LZMAFile.

    In binary mode the result is the LZMAFile object itself, i.e. the
    call is equivalent to LZMAFile(filename, mode, ...); encoding, errors
    and newline must then be left as None, otherwise ValueError is
    raised.

    In text mode the LZMAFile object is wrapped in an io.TextIOWrapper
    configured with the given encoding, error handling behavior and line
    ending(s).  Combining "t" and "b" in mode raises ValueError.
    """
    text_mode = "t" in mode

    if text_mode and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not text_mode:
        # The text-layer options make no sense without a text wrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile only understands the binary spelling of the mode.
    raw_file = LZMAFile(filename, mode.replace("t", ""),
                        format=format, check=check,
                        preset=preset, filters=filters)

    if not text_mode:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
306
307
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress *data* in one shot and return the compressed bytes.

    The optional *format*, *check*, *preset* and *filters* arguments are
    handed to LZMACompressor unchanged; see its docstring for their
    meaning.

    Use an LZMACompressor directly when the input arrives incrementally.
    """
    compressor = LZMACompressor(format, check, preset, filters)
    body = compressor.compress(data)
    # flush() yields whatever the compressor was still holding back.
    tail = compressor.flush()
    return body + tail
318
319
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress *data* in one shot and return the uncompressed bytes.

    The optional *format*, *memlimit* and *filters* arguments are handed
    to LZMADecompressor unchanged; see its docstring for their meaning.

    *data* may hold several compressed streams back to back; their
    contents are concatenated.  Bytes following the last complete stream
    that make the decompressor raise LZMAError are ignored.  LZMAError is
    raised if the very first stream cannot be decoded, or if the input
    stops before a stream's end-of-stream marker.

    Use an LZMADecompressor directly when the input arrives incrementally.
    """
    chunks = []
    remaining = data
    while True:
        # Each stream needs a fresh decompressor object.
        decoder = LZMADecompressor(format, memlimit, filters)
        try:
            chunk = decoder.decompress(remaining)
        except LZMAError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
        chunks.append(chunk)
        if not decoder.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        remaining = decoder.unused_data
        if not remaining:
            break
    return b"".join(chunks)