blob: 3924aaed1678aa56cab6ab446e98ca9b3c432cb6 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
6
# Names exported by ``from bz2 import *``.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Berker Peksag8bdd4482016-10-02 20:07:06 +030014import os
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020015import warnings
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020016import _compression
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020017from threading import RLock
Nadeem Vawda72750a82012-01-18 01:57:14 +020018
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020019from _bz2 import BZ2Compressor, BZ2Decompressor
20
21
# State codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3
26
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020027
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not None:
            # stacklevel=2 attributes the warning to the code that passed
            # the deprecated argument rather than to this module.
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning, stacklevel=2)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise the mode to its binary spelling for the builtin open()
        # and create the compressor for the writing modes.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it is used but never closed here.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset the state even if flushing or closing failed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Only read mode has a buffer; the short-circuit avoids touching
        # self._buffer in write mode.
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        # Accept any object implementing __index__, not just int.
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        # Accept any object implementing __index__, not just int.
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Any other buffer-protocol object (e.g. array.array):
                # len() would count items, so measure the byte size.
                data = memoryview(data)
                length = data.nbytes
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
283
284
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so strip the text flag.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file
324
325
def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    Returns the complete compressed stream: the compressor's output for
    data followed by its flush() output.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()
335
336
def decompress(data):
    """Decompress a block of data.

    data may hold several compressed streams back to back; their
    decompressed contents are concatenated. Trailing bytes that do not
    form a valid stream are ignored once at least one stream has been
    decoded. Empty input yields b"".

    Raises OSError if the first stream cannot be decoded, and ValueError
    if the data ends before a stream's end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        # Each stream needs its own decompressor.
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the finished stream is fed to the next round.
        data = decomp.unused_data
    return b"".join(results)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100357 return b"".join(results)