blob: e094fbb548bc95ebdff96aff8e704a6c5c9e0708 [file] [log] [blame]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02001"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
Nadeem Vawdaaf518c12012-06-04 23:32:38 +02007__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
8 "open", "compress", "decompress"]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02009
10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Berker Peksag8bdd4482016-10-02 20:07:06 +030014import os
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020015import warnings
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020016import _compression
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020017from threading import RLock
Nadeem Vawda72750a82012-01-18 01:57:14 +020018
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020019from _bz2 import BZ2Compressor, BZ2Decompressor
20
21
# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED, _MODE_READ = 0, 1
# The value 2 is no longer used and is intentionally skipped.
_MODE_WRITE = 3
26
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020027
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes
    (or another object supporting the buffer protocol).
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is outside 1..9 or mode is
        not one of the accepted strings, and TypeError if filename is
        neither a path nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        # Start out in the "closed" state so that close() is a harmless
        # no-op if anything below raises before the file is set up.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalize the mode to its binary spelling for the builtin
        # open(), and create the compressor for every writing mode.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the file ourselves, so close() must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it is never closed by us.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding back.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset the state even if flushing or closing failed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is neither an int nor an object
        providing __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is neither an int nor an object
        providing __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Accept any object supporting the buffer protocol.
                # len() of such an object counts items, not bytes (for
                # example array('I')), so measure it through a memoryview.
                data = memoryview(data)
                length = data.nbytes

            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings. The work is
        delegated to the inherited writelines(), which returns None.

        Line separators are not added between the written byte strings.
        """
        # Hold the lock across the whole sequence so the items are
        # written contiguously; write() re-acquires the same RLock.
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
277
278
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename is either a real file name (str, bytes or PathLike object)
    or an already-open file object to read from or write to.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. It defaults
    to "rb"; compresslevel defaults to 9.

    In binary mode the result is exactly what
    BZ2File(filename, mode, compresslevel=compresslevel) returns, and
    passing encoding, errors or newline raises ValueError.

    In text mode the BZ2File is wrapped in an io.TextIOWrapper that
    uses the given encoding, error handling and line ending settings.
    A mode containing both "t" and "b" raises ValueError.
    """
    wants_text = "t" in mode

    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not wants_text:
        # The text-layer options are meaningless without a text wrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the "t" flag.
    raw_file = BZ2File(filename, mode.replace("t", ""),
                       compresslevel=compresslevel)

    if not wants_text:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
318
319
def compress(data, compresslevel=9):
    """Return data compressed as one complete bzip2 stream.

    compresslevel, if given, must be a number between 1 and 9.

    Use a BZ2Compressor object instead when the input arrives
    incrementally.
    """
    encoder = BZ2Compressor(compresslevel)
    body = encoder.compress(data)
    # flush() finishes the stream and returns any data still held back.
    trailer = encoder.flush()
    return body + trailer
329
330
def decompress(data):
    """Decompress a block of data holding one or more bzip2 streams.

    The output of all consecutive streams is concatenated. Trailing
    bytes that do not form a valid stream are ignored once at least one
    stream has been decoded; an OSError on the very first stream is
    propagated. Raises ValueError if a stream ends before its
    end-of-stream marker.

    Use a BZ2Decompressor object instead when the input arrives
    incrementally.
    """
    chunks = []
    remaining = data
    while remaining:
        stream = BZ2Decompressor()
        try:
            chunk = stream.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        chunks.append(chunk)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the finished stream may be another stream.
        remaining = stream.unused_data
    return b"".join(chunks)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100351 return b"".join(results)