blob: ce07ebeb142d926aa54b124eca8623b8c483eeb4 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
6
# Public names exported by a star-import of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

# Module author metadata.
__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Berker Peksag8bdd4482016-10-02 20:07:06 +030014import os
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020015import _compression
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020016from threading import RLock
Nadeem Vawda72750a82012-01-18 01:57:14 +020017
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020018from _bz2 import BZ2Compressor, BZ2Decompressor
19
20
# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes
    (or any other object supporting the buffer protocol).
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "x", "xb", "a", "ab"):
            # Normalise to the explicit binary spelling: "wb", "xb" or "ab".
            mode = mode[0] + "b"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: use it, but never close it.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding back.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset our state even if flushing or closing fails.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        data may be bytes, bytearray, or any other object supporting
        the buffer protocol.

        Returns the number of uncompressed bytes written. Note that due
        to buffering, the file on disk may not reflect the data written
        until close() is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # len() counts items, not bytes, for buffers whose items
                # are wider than one byte (e.g. array('I')); nbytes is
                # the real size of the data handed to the compressor.
                data = memoryview(data)
                length = data.nbytes
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        The work is delegated to the base class's writelines() while
        holding the lock, and that call's result is returned unchanged
        (None for the standard io base classes) - not a byte count.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
276
277
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename is either the name of a file (a str, bytes, or PathLike
    object) or an already-open file object to read from or write to.

    mode may be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    mode, or "rt", "wt", "xt" or "at" for text mode. It defaults to
    "rb"; compresslevel defaults to 9.

    In binary mode the result is exactly
    BZ2File(filename, mode, compresslevel=compresslevel), and passing
    encoding, errors or newline raises ValueError.

    In text mode the BZ2File is wrapped in an io.TextIOWrapper
    configured with the given encoding, error handling behavior, and
    line ending(s). A mode containing both "t" and "b" raises
    ValueError.
    """
    wants_text = "t" in mode

    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not wants_text:
        # Text-layer options are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the "t" flag.
    raw_file = BZ2File(filename, mode.replace("t", ""),
                       compresslevel=compresslevel)
    if not wants_text:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
317
318
def compress(data, compresslevel=9):
    """Return data compressed as a single complete bzip2 stream.

    compresslevel, if given, must be a number between 1 and 9.

    This is the one-shot interface; use a BZ2Compressor object
    directly when the input arrives incrementally.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() emits whatever the compressor still buffers and ends the stream.
    trailer = compressor.flush()
    return body + trailer
328
329
def decompress(data):
    """Decompress a block of data and return the uncompressed bytes.

    data may hold several bzip2 streams back to back; their contents
    are concatenated. Trailing bytes that do not form a valid stream
    are ignored once at least one stream has been decoded.

    Raises OSError if data does not begin with a valid stream, and
    ValueError if a stream ends before its end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    chunks = []
    remaining = data
    while remaining:
        # Each stream needs a fresh decompressor.
        stream = BZ2Decompressor()
        try:
            chunk = stream.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        chunks.append(chunk)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        remaining = stream.unused_data
    return b"".join(chunks)