# blob: 0817b872d2019f2198b17e382cdc9beb3899ac80 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Public names exported by ``from lzma import *``.
__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]
23
import builtins
import io
import os
from _lzma import *
from _lzma import _encode_filter_properties, _decode_filter_properties
import _compression


# Internal state codes stored in LZMAFile._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3
36
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020037
class LZMAFile(_compression.BaseStream):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str,
        bytes, or PathLike object), in which case the named file is
        opened, or it can be an existing file object to read from or
        write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unrecognised mode, or when check or
        preset is given together with a read mode. Raises TypeError if
        filename is neither a path nor an object with a read() or
        write() method.
        """
        # Start out in the closed state so that close() is a harmless
        # no-op if argument validation below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # The underlying file always carries binary (compressed) data.
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it stays open after close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp, LZMADecompressor,
                trailing_error=LZMAError, format=format, filters=filters)
            self._buffer = io.BufferedReader(raw)

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Release the underlying file and reset the state even if
            # flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Only files opened for reading can be seekable.
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek() always
        # returns at least one byte (except at EOF)
        return self._buffer.peek(size)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        return self._buffer.readline(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # Accept any object supporting the buffer protocol.  len() of
            # such an object counts items, not bytes (e.g. array.array("H")),
            # so measure the payload through a memoryview instead.
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        # Write mode: position is the count of uncompressed bytes written.
        return self._pos
261
262
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename can be either an actual file name (given as a str, bytes,
    or PathLike object), in which case the named file is opened, or it
    can be an existing file object to read from or write to.

    The mode argument can be "r", "rb" (default), "w", "wb", "x", "xb",
    "a", or "ab" for binary mode, or "rt", "wt", "xt", or "at" for text
    mode.

    The format, check, preset and filters arguments specify the
    compression settings, as for LZMACompressor, LZMADecompressor and
    LZMAFile.

    For binary mode, this function is equivalent to the LZMAFile
    constructor: LZMAFile(filename, mode, ...). In this case, the
    encoding, errors and newline arguments must not be provided.

    For text mode, an LZMAFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile only understands binary modes; drop the text flag for it.
    lz_mode = mode.replace("t", "")
    binary_file = LZMAFile(filename, lz_mode, format=format, check=check,
                           preset=preset, filters=filters)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file
308
309
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data.

    Refer to LZMACompressor's docstring for a description of the
    optional arguments *format*, *check*, *preset* and *filters*.

    Returns the complete compressed stream as a bytes object.

    For incremental compression, use an LZMACompressor instead.
    """
    comp = LZMACompressor(format, check, preset, filters)
    # flush() emits whatever the compressor has buffered and finishes
    # the stream.
    return comp.compress(data) + comp.flush()
320
321
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    data may hold several concatenated compressed streams; their
    decompressed contents are joined in order. Trailing bytes that do
    not form a valid stream are ignored once at least one stream has
    been decoded.

    Raises LZMAError if the first stream is invalid, or if any stream
    ends before its end-of-stream marker.

    For incremental decompression, use an LZMADecompressor instead.
    """
    results = []
    # One decompressor is created per stream, and the loop continues
    # for as long as the previous stream left unused input behind.
    while True:
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)