# blob: 2e808cd37437e46076efcdada34004aecb9da145 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

# Public names of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

# builtins is needed because this module defines its own open(), which
# shadows the built-in one; BZ2File calls builtins.open() explicitly.
import builtins
import io
import warnings

# Use dummy_threading's RLock if the threading module cannot be imported.
try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

# The incremental (de)compressor classes come from the _bz2 module and are
# re-exported through __all__.
from _bz2 import BZ2Compressor, BZ2Decompressor
22
23
24_MODE_CLOSED = 0
25_MODE_READ = 1
26_MODE_READ_EOF = 2
27_MODE_WRITE = 3
28
29_BUFFER_SIZE = 8192
30
31
32class BZ2File(io.BufferedIOBase):
33
34 """A file object providing transparent bzip2 (de)compression.
35
36 A BZ2File can act as a wrapper for an existing file object, or refer
37 directly to a named file on disk.
38
39 Note that BZ2File provides a *binary* file interface - data read is
40 returned as bytes, and data to be written should be given as bytes.
41 """
42
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020043 def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020044 """Open a bzip2-compressed file.
45
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020046 If filename is a str or bytes object, is gives the name of the file to
47 be opened. Otherwise, it should be a file object, which will be used to
48 read or write the compressed data.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020049
Nadeem Vawda50cb9362012-06-04 23:31:22 +020050 mode can be 'r' for reading (default), 'w' for (over)writing, or 'a' for
51 appending. These can equivalently be given as 'rb', 'wb', and 'ab'.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020052
53 buffering is ignored. Its use is deprecated.
54
Nadeem Vawdacac89092012-02-04 13:08:11 +020055 If mode is 'w' or 'a', compresslevel can be a number between 1
56 and 9 specifying the level of compression: 1 produces the least
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020057 compression, and 9 (default) produces the most compression.
Nadeem Vawdacac89092012-02-04 13:08:11 +020058
59 If mode is 'r', the input file may be the concatenation of
60 multiple compressed streams.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020061 """
62 # This lock must be recursive, so that BufferedIOBase's
63 # readline(), readlines() and writelines() don't deadlock.
Nadeem Vawda72750a82012-01-18 01:57:14 +020064 self._lock = RLock()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020065 self._fp = None
66 self._closefp = False
67 self._mode = _MODE_CLOSED
68 self._pos = 0
69 self._size = -1
70
71 if buffering is not None:
72 warnings.warn("Use of 'buffering' argument is deprecated",
73 DeprecationWarning)
74
75 if not (1 <= compresslevel <= 9):
76 raise ValueError("compresslevel must be between 1 and 9")
77
78 if mode in ("", "r", "rb"):
79 mode = "rb"
80 mode_code = _MODE_READ
81 self._decompressor = BZ2Decompressor()
82 self._buffer = None
83 elif mode in ("w", "wb"):
84 mode = "wb"
85 mode_code = _MODE_WRITE
Nadeem Vawda249ab5e2011-09-11 22:38:11 +020086 self._compressor = BZ2Compressor(compresslevel)
Nadeem Vawda55b43382011-05-27 01:52:15 +020087 elif mode in ("a", "ab"):
88 mode = "ab"
89 mode_code = _MODE_WRITE
Nadeem Vawda249ab5e2011-09-11 22:38:11 +020090 self._compressor = BZ2Compressor(compresslevel)
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020091 else:
92 raise ValueError("Invalid mode: {!r}".format(mode))
93
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020094 if isinstance(filename, (str, bytes)):
Nadeem Vawdaaf518c12012-06-04 23:32:38 +020095 self._fp = builtins.open(filename, mode)
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020096 self._closefp = True
97 self._mode = mode_code
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020098 elif hasattr(filename, "read") or hasattr(filename, "write"):
99 self._fp = filename
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200100 self._mode = mode_code
101 else:
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +0200102 raise TypeError("filename must be a str or bytes object, or a file")
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200103
104 def close(self):
105 """Flush and close the file.
106
107 May be called more than once without error. Once the file is
108 closed, any other operation on it will raise a ValueError.
109 """
110 with self._lock:
111 if self._mode == _MODE_CLOSED:
112 return
113 try:
114 if self._mode in (_MODE_READ, _MODE_READ_EOF):
115 self._decompressor = None
116 elif self._mode == _MODE_WRITE:
117 self._fp.write(self._compressor.flush())
118 self._compressor = None
119 finally:
Antoine Pitrou24ce3862011-04-03 17:08:49 +0200120 try:
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200121 if self._closefp:
122 self._fp.close()
123 finally:
124 self._fp = None
125 self._closefp = False
126 self._mode = _MODE_CLOSED
127 self._buffer = None
128
129 @property
130 def closed(self):
131 """True if this file is closed."""
132 return self._mode == _MODE_CLOSED
133
134 def fileno(self):
135 """Return the file descriptor for the underlying file."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200136 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200137 return self._fp.fileno()
138
139 def seekable(self):
140 """Return whether the file supports seeking."""
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200141 return self.readable() and self._fp.seekable()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200142
143 def readable(self):
144 """Return whether the file was opened for reading."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200145 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200146 return self._mode in (_MODE_READ, _MODE_READ_EOF)
147
148 def writable(self):
149 """Return whether the file was opened for writing."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200150 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200151 return self._mode == _MODE_WRITE
152
153 # Mode-checking helper functions.
154
155 def _check_not_closed(self):
156 if self.closed:
157 raise ValueError("I/O operation on closed file")
158
159 def _check_can_read(self):
160 if not self.readable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200161 raise io.UnsupportedOperation("File not open for reading")
162
163 def _check_can_write(self):
164 if not self.writable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200165 raise io.UnsupportedOperation("File not open for writing")
166
167 def _check_can_seek(self):
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200168 if not self.readable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200169 raise io.UnsupportedOperation("Seeking is only supported "
Nadeem Vawdaf1a1af22011-05-25 00:32:08 +0200170 "on files open for reading")
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200171 if not self._fp.seekable():
172 raise io.UnsupportedOperation("The underlying file object "
173 "does not support seeking")
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200174
175 # Fill the readahead buffer if it is empty. Returns False on EOF.
176 def _fill_buffer(self):
177 if self._buffer:
178 return True
Nadeem Vawda55b43382011-05-27 01:52:15 +0200179
180 if self._decompressor.unused_data:
181 rawblock = self._decompressor.unused_data
182 else:
183 rawblock = self._fp.read(_BUFFER_SIZE)
184
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200185 if not rawblock:
Nadeem Vawda55b43382011-05-27 01:52:15 +0200186 if self._decompressor.eof:
187 self._mode = _MODE_READ_EOF
188 self._size = self._pos
189 return False
190 else:
191 raise EOFError("Compressed file ended before the "
192 "end-of-stream marker was reached")
193
194 # Continue to next stream.
195 if self._decompressor.eof:
196 self._decompressor = BZ2Decompressor()
197
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200198 self._buffer = self._decompressor.decompress(rawblock)
199 return True
200
201 # Read data until EOF.
202 # If return_data is false, consume the data without returning it.
203 def _read_all(self, return_data=True):
204 blocks = []
205 while self._fill_buffer():
206 if return_data:
207 blocks.append(self._buffer)
208 self._pos += len(self._buffer)
209 self._buffer = None
210 if return_data:
211 return b"".join(blocks)
212
213 # Read a block of up to n bytes.
214 # If return_data is false, consume the data without returning it.
215 def _read_block(self, n, return_data=True):
216 blocks = []
217 while n > 0 and self._fill_buffer():
218 if n < len(self._buffer):
219 data = self._buffer[:n]
220 self._buffer = self._buffer[n:]
221 else:
222 data = self._buffer
223 self._buffer = None
224 if return_data:
225 blocks.append(data)
226 self._pos += len(data)
227 n -= len(data)
228 if return_data:
229 return b"".join(blocks)
230
231 def peek(self, n=0):
232 """Return buffered data without advancing the file position.
233
234 Always returns at least one byte of data, unless at EOF.
235 The exact number of bytes returned is unspecified.
236 """
237 with self._lock:
238 self._check_can_read()
239 if self._mode == _MODE_READ_EOF or not self._fill_buffer():
240 return b""
241 return self._buffer
242
243 def read(self, size=-1):
244 """Read up to size uncompressed bytes from the file.
245
246 If size is negative or omitted, read until EOF is reached.
247 Returns b'' if the file is already at EOF.
248 """
249 with self._lock:
250 self._check_can_read()
251 if self._mode == _MODE_READ_EOF or size == 0:
252 return b""
253 elif size < 0:
254 return self._read_all()
255 else:
256 return self._read_block(size)
257
258 def read1(self, size=-1):
259 """Read up to size uncompressed bytes with at most one read
260 from the underlying stream.
261
262 Returns b'' if the file is at EOF.
263 """
264 with self._lock:
265 self._check_can_read()
266 if (size == 0 or self._mode == _MODE_READ_EOF or
267 not self._fill_buffer()):
268 return b""
269 if 0 < size < len(self._buffer):
270 data = self._buffer[:size]
271 self._buffer = self._buffer[size:]
272 else:
273 data = self._buffer
274 self._buffer = None
275 self._pos += len(data)
276 return data
277
278 def readinto(self, b):
279 """Read up to len(b) bytes into b.
Antoine Pitrou24ce3862011-04-03 17:08:49 +0200280
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200281 Returns the number of bytes read (0 for EOF).
282 """
283 with self._lock:
284 return io.BufferedIOBase.readinto(self, b)
285
286 def readline(self, size=-1):
287 """Read a line of uncompressed bytes from the file.
288
289 The terminating newline (if present) is retained. If size is
290 non-negative, no more than size bytes will be read (in which
291 case the line may be incomplete). Returns b'' if already at EOF.
292 """
293 if not hasattr(size, "__index__"):
294 raise TypeError("Integer argument expected")
295 size = size.__index__()
296 with self._lock:
297 return io.BufferedIOBase.readline(self, size)
298
299 def readlines(self, size=-1):
300 """Read a list of lines of uncompressed bytes from the file.
301
302 size can be specified to control the number of lines read: no
303 further lines will be read once the total size of the lines read
304 so far equals or exceeds size.
305 """
306 if not hasattr(size, "__index__"):
307 raise TypeError("Integer argument expected")
308 size = size.__index__()
309 with self._lock:
310 return io.BufferedIOBase.readlines(self, size)
311
312 def write(self, data):
313 """Write a byte string to the file.
314
315 Returns the number of uncompressed bytes written, which is
316 always len(data). Note that due to buffering, the file on disk
317 may not reflect the data written until close() is called.
318 """
319 with self._lock:
320 self._check_can_write()
321 compressed = self._compressor.compress(data)
322 self._fp.write(compressed)
323 self._pos += len(data)
324 return len(data)
325
326 def writelines(self, seq):
327 """Write a sequence of byte strings to the file.
328
329 Returns the number of uncompressed bytes written.
330 seq can be any iterable yielding byte strings.
331
332 Line separators are not added between the written byte strings.
333 """
334 with self._lock:
335 return io.BufferedIOBase.writelines(self, seq)
336
337 # Rewind the file to the beginning of the data stream.
338 def _rewind(self):
339 self._fp.seek(0, 0)
340 self._mode = _MODE_READ
341 self._pos = 0
342 self._decompressor = BZ2Decompressor()
343 self._buffer = None
344
345 def seek(self, offset, whence=0):
346 """Change the file position.
347
348 The new position is specified by offset, relative to the
349 position indicated by whence. Values for whence are:
350
351 0: start of stream (default); offset must not be negative
352 1: current stream position
353 2: end of stream; offset must not be positive
354
355 Returns the new file position.
356
357 Note that seeking is emulated, so depending on the parameters,
358 this operation may be extremely slow.
359 """
360 with self._lock:
361 self._check_can_seek()
362
363 # Recalculate offset as an absolute file position.
364 if whence == 0:
365 pass
366 elif whence == 1:
367 offset = self._pos + offset
368 elif whence == 2:
369 # Seeking relative to EOF - we need to know the file's size.
370 if self._size < 0:
371 self._read_all(return_data=False)
372 offset = self._size + offset
373 else:
374 raise ValueError("Invalid value for whence: {}".format(whence))
375
376 # Make it so that offset is the number of bytes to skip forward.
377 if offset < self._pos:
378 self._rewind()
379 else:
380 offset -= self._pos
381
382 # Read and discard data until we reach the desired position.
383 if self._mode != _MODE_READ_EOF:
384 self._read_block(offset, return_data=False)
385
386 return self._pos
387
388 def tell(self):
389 """Return the current file position."""
390 with self._lock:
391 self._check_not_closed()
392 return self._pos
393
394
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename is either the name of a file (a str or bytes object) or an
    already-open file object that compressed data is read from or written to.

    mode is one of "r", "rb", "w", "wb", "a" or "ab" for binary access, or
    "rt", "wt" or "at" for text access. It defaults to "rb"; compresslevel
    defaults to 9.

    In binary mode the result is BZ2File(filename, mode, compresslevel),
    and encoding, errors and newline must all be left as None; passing any
    of them raises ValueError.

    In text mode the BZ2File is wrapped in an io.TextIOWrapper configured
    with the given encoding, errors and newline. Combining "t" with "b" in
    mode raises ValueError.

    """
    if "t" not in mode:
        # Binary mode: the text-only arguments are not allowed.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")
        return BZ2File(filename, mode, compresslevel=compresslevel)

    if "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    # Text mode: BZ2File itself only understands the binary mode strings.
    raw_file = BZ2File(filename, mode.replace("t", ""),
                       compresslevel=compresslevel)
    return io.TextIOWrapper(raw_file, encoding, errors, newline)
433
434
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.

    compresslevel, if given, must be a number between 1 and 9.

    Use a BZ2Compressor object instead to compress data incrementally.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() finishes the stream and returns whatever is still buffered.
    trailer = compressor.flush()
    return body + trailer
444
445
def decompress(data):
    """Decompress a block of data.

    data may be the concatenation of several compressed streams; the
    decompressed contents of all of them are joined and returned.
    Empty input yields b"".

    Trailing bytes that do not form a valid bzip2 stream are ignored once
    at least one stream has been decompressed successfully.

    Raises ValueError if a stream ends before its end-of-stream marker,
    and propagates the decompressor's error if the very first stream is
    invalid.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        # Each stream needs its own decompressor.
        decomp = BZ2Decompressor()
        try:
            chunk = decomp.decompress(data)
        except OSError:
            if results:
                # Leftover data is not a valid bzip2 stream; ignore it
                # rather than discarding what was already decompressed.
                break
            # Error on the first stream: there is nothing to salvage.
            raise
        results.append(chunk)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the end of this stream is the next stream.
        data = decomp.unused_data
    return b"".join(results)