blob: fe4118f9a7a28133bad4beaf2db0a1a2b36621d1 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

# Names exported by "from bz2 import *".
__all__ = [
    "BZ2File",
    "BZ2Compressor",
    "BZ2Decompressor",
    "open",
    "compress",
    "decompress",
]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Nadeem Vawdaaf518c12012-06-04 23:32:38 +020012import builtins
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020014import warnings
15
Nadeem Vawda72750a82012-01-18 01:57:14 +020016try:
17 from threading import RLock
18except ImportError:
19 from dummy_threading import RLock
20
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020021from _bz2 import BZ2Compressor, BZ2Decompressor
22
23
# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

# Number of compressed bytes requested from the underlying file per read.
_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str or bytes object, it gives the name of the file to
        be opened. Otherwise, it should be a file object, which will be used to
        read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing, or 'a' for
        appending. These can equivalently be given as 'rb', 'wb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams. Data following the last valid stream
        that is not itself a bzip2 stream is ignored.

        Raises ValueError for an invalid mode or compresslevel, and
        TypeError if filename is neither a name nor a file object.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0       # Position in the *uncompressed* stream.
        self._size = -1     # Uncompressed size; -1 until EOF has been seen.

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            # Readahead buffer of decompressed data, and the index of the
            # first byte in it that has not been handed to the caller yet.
            self._buffer = b""
            self._buffer_offset = 0
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            self._fp = builtins.open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding back,
                    # including the end-of-stream marker.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Release the underlying file and reset our state even if
                # the final flush/write failed.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = b""
                    self._buffer_offset = 0

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._mode == _MODE_READ_EOF:
            return False
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while self._buffer_offset == len(self._buffer):
            # Input left over from a finished stream takes priority over
            # reading more compressed bytes from the file.
            rawblock = (self._decompressor.unused_data or
                        self._fp.read(_BUFFER_SIZE))

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            if self._decompressor.eof:
                # Continue to next stream.
                self._decompressor = BZ2Decompressor()
                try:
                    self._buffer = self._decompressor.decompress(rawblock)
                except OSError:
                    # At least one complete stream has been decoded and what
                    # follows it is not a bzip2 stream. Treat the trailing
                    # bytes as padding and report EOF rather than failing.
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
            else:
                self._buffer = self._decompressor.decompress(rawblock)
            self._buffer_offset = 0
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = b""
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        # If we have enough data buffered, return immediately.
        end = self._buffer_offset + n
        if end <= len(self._buffer):
            data = self._buffer[self._buffer_offset : end]
            self._buffer_offset = end
            self._pos += len(data)
            return data if return_data else None

        # The loop assumes that _buffer_offset is 0. Ensure that this is true.
        self._buffer = self._buffer[self._buffer_offset:]
        self._buffer_offset = 0

        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only what was asked for; the rest stays buffered.
                data = self._buffer[:n]
                self._buffer_offset = n
            else:
                data = self._buffer
                self._buffer = b""
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if not self._fill_buffer():
                return b""
            return self._buffer[self._buffer_offset:]

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or
                # Only call _fill_buffer() if the buffer is actually empty.
                # This gives a significant speedup if *size* is small.
                (self._buffer_offset == len(self._buffer) and
                 not self._fill_buffer())):
                return b""
            if size > 0:
                data = self._buffer[self._buffer_offset :
                                    self._buffer_offset + size]
                self._buffer_offset += len(data)
            else:
                # Negative size: hand over everything currently buffered.
                data = self._buffer[self._buffer_offset:]
                self._buffer = b""
                self._buffer_offset = 0
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an integer, ValueError if the file
        is closed, and io.UnsupportedOperation if it is not open for reading.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            # The readahead buffer only exists in read mode, so the mode
            # must be validated before the shortcut below touches it.
            self._check_can_read()
            # Shortcut for the common case - the whole line is in the buffer.
            if size < 0:
                end = self._buffer.find(b"\n", self._buffer_offset) + 1
                if end > 0:
                    line = self._buffer[self._buffer_offset : end]
                    self._buffer_offset = end
                    self._pos += len(line)
                    return line
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = b""
        self._buffer_offset = 0

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                # Going backwards means decompressing again from the start.
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos
427
428
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename may be the name of a file (a str or bytes object) or an
    already-open file object to read from or write to.

    mode is one of "r", "rb", "w", "wb", "a" or "ab" for binary mode, or
    "rt", "wt" or "at" for text mode. It defaults to "rb"; compresslevel
    defaults to 9.

    In binary mode the result is BZ2File(filename, mode, compresslevel),
    and passing encoding, errors or newline raises ValueError.

    In text mode the BZ2File is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and line ending(s). Combining
    "t" and "b" in mode raises ValueError.
    """
    text_mode = "t" in mode

    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # Text-only options are rejected in the order they are declared.
        text_only_options = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for value, message in text_only_options:
            if value is not None:
                raise ValueError(message)

    # BZ2File only understands binary modes, so drop the text flag.
    raw_file = BZ2File(filename, mode.replace("t", ""),
                       compresslevel=compresslevel)

    if not text_mode:
        return raw_file
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
467
468
def compress(data, compresslevel=9):
    """Compress a block of data in one shot and return the result.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() emits whatever is still buffered and ends the stream.
    trailer = compressor.flush()
    return body + trailer
478
479
def decompress(data):
    """Decompress a block of data.

    data may be the concatenation of several compressed streams; the
    decompressed contents of all of them are joined and returned. Empty
    input yields b"". Bytes following the last complete stream that do
    not form a bzip2 stream are ignored.

    Raises ValueError if the data ends before an end-of-stream marker is
    reached, and OSError if the data does not begin with a valid stream.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        decomp = BZ2Decompressor()
        try:
            chunk = decomp.decompress(data)
        except OSError:
            if results:
                # Leftover data is not a valid bzip2 stream; ignore it
                # instead of discarding what was already decompressed.
                break
            # Error on the very first stream: the input is simply invalid.
            raise
        results.append(chunk)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Anything left over is the start of the next stream (if any).
        data = decomp.unused_data
    return b"".join(results)