blob: 9abf06d91db1848e1df95ecd29433f592098f514 [file] [log] [blame]
"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""
10
# Names exported by ``from lzma import *``: the constants and classes
# re-exported from the _lzma extension module, followed by the helpers
# defined in this file.
__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]
23
Nadeem Vawdae8604042012-06-04 23:38:12 +020024import builtins
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020025import io
Berker Peksag5f59ddd2016-10-04 20:41:20 +030026import os
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020027from _lzma import *
Nadeem Vawdaa425c3d2012-06-21 23:36:48 +020028from _lzma import _encode_filter_properties, _decode_filter_properties
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020029import _compression
Nadeem Vawda3ff069e2011-11-30 00:25:06 +020030
31
# State codes stored in LZMAFile._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class LZMAFile(_compression.BaseStream):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str,
        bytes, or PathLike object), in which case the named file is
        opened, or it can be an existing file object to read from or
        write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an unrecognised mode, or when check or
        preset is given together with a read mode, and TypeError when
        filename is neither a path nor an object with a read() or
        write() attribute.
        """
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            # The compressor is created before any file is opened, so
            # invalid compression settings fail without touching the disk.
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
            self._pos = 0
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes, os.PathLike)):
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            # We opened the file, so close() is responsible for closing it.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            try:
                raw = _compression.DecompressReader(self._fp, LZMADecompressor,
                    trailing_error=LZMAError, format=format, filters=filters)
                self._buffer = io.BufferedReader(raw)
            except BaseException:
                # Building the reader failed.  Release a file we opened
                # ourselves and mark the object closed, so that a later
                # close() does not try to use the missing _buffer.
                fp, closefp = self._fp, self._closefp
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED
                if closefp:
                    fp.close()
                raise

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor still holds before closing.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and reset the state,
            # even if flushing or closing above raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek() always
        # returns at least one byte (except at EOF)
        return self._buffer.peek(size)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        return self._buffer.readline(size)

    def __iter__(self):
        """Iterate over the lines of the file via the internal buffer."""
        self._check_can_read()
        return self._buffer.__iter__()

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            # nbytes, not len(): items may be wider than one byte.
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        # In write mode the position is the count of uncompressed bytes
        # accepted by write() so far.
        return self._pos
273
274
def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename can be either an actual file name (given as a str, bytes,
    or PathLike object), in which case the named file is opened, or it
    can be an existing file object to read from or write to.

    The mode argument can be "r", "rb" (default), "w", "wb", "x", "xb",
    "a", or "ab" for binary mode, or "rt", "wt", "xt", or "at" for text
    mode.

    The format, check, preset and filters arguments specify the
    compression settings, as for LZMACompressor, LZMADecompressor and
    LZMAFile.

    For binary mode, this function is equivalent to the LZMAFile
    constructor: LZMAFile(filename, mode, ...). In this case, the
    encoding, errors and newline arguments must not be provided.

    For text mode, an LZMAFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given for a binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile only understands binary modes; strip the text marker.
    lz_mode = mode.replace("t", "")
    binary_file = LZMAFile(filename, lz_mode, format=format, check=check,
                           preset=preset, filters=filters)

    if "t" in mode:
        try:
            encoding = io.text_encoding(encoding)
            return io.TextIOWrapper(binary_file, encoding, errors, newline)
        except BaseException:
            # Do not leave the freshly created LZMAFile open if the text
            # layer cannot be built on top of it.
            binary_file.close()
            raise
    else:
        return binary_file
321
322
def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data.

    Refer to LZMACompressor's docstring for a description of the
    optional arguments *format*, *check*, *preset* and *filters*.

    Returns the complete compressed stream: the output of compressing
    *data* followed by the compressor's flush() output.

    For incremental compression, use an LZMACompressor instead.
    """
    comp = LZMACompressor(format, check, preset, filters)
    return comp.compress(data) + comp.flush()
333
334
def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    *data* may hold several compressed streams back to back; their
    decompressed contents are concatenated. Data following a complete
    stream that cannot be decoded as a further stream is ignored.

    Raises LZMAError if the first stream cannot be decoded, or if the
    data ends before a stream's end-of-stream marker is reached.

    For incremental decompression, use an LZMADecompressor instead.
    """
    results = []
    while True:
        # Each stream gets a fresh decompressor.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Anything after the end of this stream may be another stream.
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)