"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import builtins
import io
import warnings

try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

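    # Illustrative sketch (not part of the class): reading a compressed file
    # line by line. "data.bz2" is a hypothetical path; lines come back as
    # bytes objects because BZ2File is a binary file interface.
    #
    #     with BZ2File("data.bz2") as f:
    #         for line in f:
    #             ...  # line is a bytes object
    #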
    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str or bytes object, it gives the name of the file to
        be opened. Otherwise, it should be a file object, which will be used to
        read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing, or 'a' for
        appending. These can equivalently be given as 'rb', 'wb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes)):
            self._fp = builtins.open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

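    # Illustrative sketch: instead of a filename, BZ2File can wrap any object
    # with read() or write() methods (the second branch above). For example,
    # compressing into an in-memory buffer:
    #
    #     buf = io.BytesIO()
    #     with BZ2File(buf, "w") as f:
    #         f.write(b"some data")
    #     raw = buf.getvalue()    # complete bzip2 stream; buf is not closed,
    #                             # since BZ2File did not open it
    #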
    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        # Depending on the input data, our call to the decompressor may not
        # return any data. In this case, try again after reading another block.
        while True:
            if self._buffer:
                return True

            if self._decompressor.unused_data:
                rawblock = self._decompressor.unused_data
            else:
                rawblock = self._fp.read(_BUFFER_SIZE)

            if not rawblock:
                if self._decompressor.eof:
                    self._mode = _MODE_READ_EOF
                    self._size = self._pos
                    return False
                else:
                    raise EOFError("Compressed file ended before the "
                                   "end-of-stream marker was reached")

            # Continue to next stream.
            if self._decompressor.eof:
                self._decompressor = BZ2Decompressor()

            self._buffer = self._decompressor.decompress(rawblock)

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        # Usually, read1() calls _fp.read() at most once. However, sometimes
        # this does not give enough data for the decompressor to make progress.
        # In this case we make multiple reads, to avoid returning b"".
        with self._lock:
            self._check_can_read()
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

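    # Illustrative sketch: opening in "a" mode appends a second bzip2 stream
    # to the file; read mode decompresses concatenated streams transparently
    # (see _fill_buffer). "log.bz2" is a hypothetical path:
    #
    #     with BZ2File("log.bz2", "w") as f:
    #         f.write(b"first\n")
    #     with BZ2File("log.bz2", "a") as f:
    #         f.write(b"second\n")
    #     with BZ2File("log.bz2") as f:
    #         assert f.read() == b"first\nsecond\n"
    #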
    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos

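    # Illustrative sketch: because seeking is emulated, seek(0, 2) decompresses
    # all remaining data to find the end, and seeking backwards rewinds and
    # re-reads from the start. This makes it possible (if slow) to measure the
    # uncompressed size of a file ("big.bz2" is a hypothetical path):
    #
    #     with BZ2File("big.bz2") as f:
    #         size = f.seek(0, 2)     # decompressed size in bytes
    #         f.seek(0)               # back to the beginning
    #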
    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "a" or "ab" for binary mode,
    or "rt", "wt" or "at" for text mode. The default mode is "rb", and the
    default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File constructor:
    BZ2File(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file
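
# Illustrative sketch: in text mode, open() layers an io.TextIOWrapper over
# the BZ2File, so reads return str. "notes.txt.bz2" is a hypothetical path:
#
#     import bz2
#     with bz2.open("notes.txt.bz2", "rt", encoding="utf-8") as f:
#         text = f.read()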


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()
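
# Illustrative sketch: the incremental counterpart of compress(), feeding data
# to a BZ2Compressor in pieces ("chunks" is a hypothetical iterable of bytes):
#
#     comp = BZ2Compressor(9)
#     parts = [comp.compress(chunk) for chunk in chunks]
#     parts.append(comp.flush())
#     compressed = b"".join(parts)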


def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if len(data) == 0:
        return b""

    results = []
    while True:
        decomp = BZ2Decompressor()
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        if not decomp.unused_data:
            return b"".join(results)
        # There is unused data left over. Proceed to next stream.
        data = decomp.unused_data
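
# Illustrative sketch: decompress() handles concatenated streams, which is
# exactly what compressing blocks independently and joining them produces:
#
#     payload = compress(b"spam") + compress(b"eggs")
#     assert decompress(payload) == b"spameggs"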