"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
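
# A minimal usage sketch of the one-shot helpers defined at the bottom of
# this module (the payload below is illustrative):
#
#     payload = b"hello world" * 100
#     blob = compress(payload, compresslevel=9)
#     assert decompress(blob) == payload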

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import builtins
import io
import warnings

try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str or bytes object, it gives the name of the file to
        be opened. Otherwise, it should be a file object, which will be used to
        read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing, or 'a' for
        appending. These can equivalently be given as 'rb', 'wb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            self._fp = builtins.open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = b""
                    self._buffer_offset = 0

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")
        if self._mode != _MODE_WRITE:
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")
        if self._mode not in (_MODE_READ, _MODE_READ_EOF):
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            # Continue to next stream.
            if self._decompressor.eof:
                self._decompressor = BZ2Decompressor()

            self._buffer = self._decompressor.decompress(rawblock)
            self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if not self._fill_buffer():
                return b""
            return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or
                # Only call _fill_buffer() if the buffer is actually empty.
                # This gives a significant speedup if *size* is small.
                (self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
                return b""
            if size > 0:
                data = self._buffer[self._buffer_offset :
                                    self._buffer_offset + size]
                self._buffer_offset += len(data)
            else:
                data = self._buffer[self._buffer_offset:]
                self._buffer = b""
                self._buffer_offset = 0
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            # Shortcut for the common case - the whole line is in the buffer.
            if size < 0:
                end = self._buffer.find(b"\n", self._buffer_offset) + 1
                if end > 0:
                    line = self._buffer[self._buffer_offset : end]
                    self._buffer_offset = end
                    self._pos += len(line)
                    return line
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos

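# A usage sketch for BZ2File (the path below is illustrative). BZ2File is a
# binary interface, so everything read or written is bytes:
#
#     with BZ2File("example.bz2", "w") as f:
#         f.write(b"line one\n")
#         f.write(b"line two\n")
#     with BZ2File("example.bz2", "r") as f:
#         assert f.readline() == b"line one\n"
#         assert f.read() == b"line two\n"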

def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "a" or "ab" for binary mode,
    or "rt", "wt" or "at" for text mode. The default mode is "rb", and the
    default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File constructor:
    BZ2File(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file

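# A sketch of open() in text mode (path and text are illustrative); the file
# object returned is an io.TextIOWrapper, which handles encoding and newline
# translation on top of the compressed binary stream:
#
#     with open("notes.txt.bz2", "wt", encoding="utf-8") as f:
#         f.write("first line\n")
#     with open("notes.txt.bz2", "rt", encoding="utf-8") as f:
#         assert f.read() == "first line\n"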

def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()

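# An incremental compression sketch using the BZ2Compressor/BZ2Decompressor
# classes imported from _bz2, useful when the data arrives in chunks (the
# chunks below are illustrative):
#
#     comp = BZ2Compressor(9)
#     blob = b"".join(comp.compress(chunk) for chunk in (b"abc", b"def"))
#     blob += comp.flush()
#     decomp = BZ2Decompressor()
#     assert decomp.decompress(blob) == b"abcdef"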

def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if len(data) == 0:
        return b""

    results = []
    while True:
        decomp = BZ2Decompressor()
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        if not decomp.unused_data:
            return b"".join(results)
        # There is unused data left over. Proceed to next stream.
        data = decomp.unused_data
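
# decompress() accepts the concatenation of multiple bzip2 streams, mirroring
# BZ2File's handling of concatenated streams in read mode. A small sketch
# (the payloads are illustrative):
#
#     blob = compress(b"first stream") + compress(b"second stream")
#     assert decompress(blob) == b"first streamsecond stream"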