blob: 1de8f3ca7d4f14a4c0dbd8b074e211fdc4cedbc6 [file] [log] [blame]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02001"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
# Public names exported by a star-import of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Nadeem Vawdaaf518c12012-06-04 23:32:38 +020012import builtins
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020014import warnings
15
Nadeem Vawda72750a82012-01-18 01:57:14 +020016try:
17 from threading import RLock
18except ImportError:
19 from dummy_threading import RLock
20
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020021from _bz2 import BZ2Compressor, BZ2Decompressor
22
23
# State codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Set once the end of the compressed data has been reached while reading;
# a later backwards seek() resets the mode to _MODE_READ.
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8192
30
31
class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str or bytes object, it gives the name of the file
        to be opened. Otherwise, it should be a file object, which will be
        used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing, or 'a'
        for appending. These can equivalently be given as 'rb', 'wb', and
        'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is out of range or mode is not
        recognised, and TypeError if filename is neither a str/bytes name
        nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            # stacklevel=2 attributes the warning to the caller's line
            # rather than to this module.
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning, stacklevel=2)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            self._fp = builtins.open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding back.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset our state even if flushing or closing fails.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = b""
                    self._buffer_offset = 0

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        """Raise ValueError if the file has been closed."""
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        """Raise unless the file is open for reading."""
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            # A closed file reports ValueError in preference to
            # UnsupportedOperation.
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        """Raise unless the file is open for writing."""
        if self._mode != _MODE_WRITE:
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        """Raise unless the file is readable and the raw file can seek."""
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    def _fill_buffer(self):
        """Fill the readahead buffer if it is empty.

        Returns True if unread data is available afterwards, or False on
        EOF. Raises EOFError if the compressed input ends before the
        end-of-stream marker.
        """
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Bytes left over after a finished stream take priority over
            # fresh input from the underlying file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    # Now that EOF is known, remember the total size so
                    # that seek(..., 2) need not scan the file again.
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            # Continue to next stream.
            if self._decompressor.eof:
                self._decompressor = BZ2Decompressor()
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except OSError:
                    # Trailing data isn't a valid bzip2 stream. We're done here.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
            self._buffer_offset = 0
        return True

    def _read_all(self, return_data=True):
        """Read data until EOF.

        If return_data is false, consume the data without returning it
        (None is returned instead).
        """
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    def _read_block(self, n, return_data=True):
        """Read a block of up to n bytes.

        If return_data is false, consume the data without returning it
        (None is returned instead).
        """
        # Nothing to consume. Guarding here also prevents a negative n
        # from moving _buffer_offset backwards (to a negative value),
        # which would make every later buffer slice come back empty.
        if n <= 0:
            return b"" if return_data else None

        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only part of the buffer; the rest stays for later.
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if not self._fill_buffer():
                return b""
            return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative, None or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size is None or size < 0:
                # None means "read everything", as for other
                # io.BufferedIOBase implementations.
                return self._read_all()
            elif size == 0:
                return b""
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or
                # Only call _fill_buffer() if the buffer is actually empty.
                # This gives a significant speedup if *size* is small.
                (self._buffer_offset == len(self._buffer) and
                 not self._fill_buffer())):
                return b""
            if size > 0:
                data = self._buffer[self._buffer_offset :
                                    self._buffer_offset + size]
                self._buffer_offset += len(data)
            else:
                # Negative size: hand over everything currently buffered.
                data = self._buffer[self._buffer_offset:]
                self._buffer = b""
                self._buffer_offset = 0
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an integer-like object.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            # Shortcut for the common case - the whole line is in the buffer.
            if size < 0:
                # find() returns -1 when there is no newline, so end == 0
                # means "not found" and we fall through to the slow path.
                end = self._buffer.find(b"\n", self._buffer_offset) + 1
                if end > 0:
                    line = self._buffer[self._buffer_offset : end]
                    self._buffer_offset = end
                    self._pos += len(line)
                    return line
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an integer-like object.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    def _rewind(self):
        """Rewind the file to the beginning of the data stream."""
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        A target position before the start of the stream is treated as
        position 0, and one past the end stops at the end of the stream.

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
                # Clamp targets before the start of the stream to 0;
                # a negative skip count would corrupt the buffer offset.
                offset = max(offset, 0)
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos
440
441
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "a" or "ab" for binary mode,
    or "rt", "wt" or "at" for text mode. The default mode is "rb", and the
    default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File constructor:
    BZ2File(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError if mode combines "t" with "b", or if encoding, errors
    or newline is given in binary mode.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File itself only understands the binary mode strings.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if not text_mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except BaseException:
        # Wrapping failed (e.g. unknown encoding): don't leak the file
        # that was just opened.
        binary_file.close()
        raise
480
481
def compress(data, compresslevel=9):
    """Return *data* compressed in one shot.

    compresslevel, if given, must be a number between 1 and 9.

    Use a BZ2Compressor object directly when the input arrives
    incrementally.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() finishes the compression and returns whatever is still
    # buffered inside the compressor.
    trailer = compressor.flush()
    return body + trailer
491
492
def decompress(data):
    """Return the decompressed contents of *data* in one shot.

    *data* may hold several concatenated bzip2 streams; their contents
    are joined. Bytes following the last valid stream that do not form
    a bzip2 stream are ignored. Raises ValueError if a stream is
    truncated.

    Use a BZ2Decompressor object directly when the input arrives
    incrementally.
    """
    pieces = []
    remaining = data
    while remaining:
        stream = BZ2Decompressor()
        try:
            piece = stream.decompress(remaining)
        except OSError:
            if not pieces:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        pieces.append(piece)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Anything past the end-of-stream marker may be another stream.
        remaining = stream.unused_data
    return b"".join(pieces)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100513 return b"".join(results)