blob: 51b9ac438856aae4bba0f184a1226b0e371408aa [file] [log] [blame]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02001"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
7__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
8 "decompress"]
9
10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
12import io
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import warnings
14
Nadeem Vawda72750a82012-01-18 01:57:14 +020015try:
16 from threading import RLock
17except ImportError:
18 from dummy_threading import RLock
19
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020020from _bz2 import BZ2Compressor, BZ2Decompressor
21
22
23_MODE_CLOSED = 0
24_MODE_READ = 1
25_MODE_READ_EOF = 2
26_MODE_WRITE = 3
27
28_BUFFER_SIZE = 8192
29
30
class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename=None, mode="r", buffering=None,
                 compresslevel=9, *, fileobj=None):
        """Open a bzip2-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be 'r' for reading (default), 'w' for (over)writing, or
        'a' for appending.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if mode is invalid, if compresslevel is out of
        range, or if neither (or both) of filename and fileobj is given.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None         # Underlying (compressed) file object.
        self._closefp = False   # True if we opened _fp and must close it.
        self._mode = _MODE_CLOSED
        self._pos = 0           # Position in the *uncompressed* data stream.
        self._size = -1         # Uncompressed size; -1 until EOF is reached.

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None     # Readahead buffer of decompressed bytes.
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            # Append differs from write only in how the underlying file is
            # opened; a fresh compressed stream is started in either case.
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            self._fp = open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    # Flush data still buffered inside the compressor
                    # before the underlying file goes away.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Close the underlying file (only if we opened it), and
                # reset all state even if that close itself fails.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Seeking is emulated on top of reading, so it is only possible
        # in read mode, and only if the underlying file is seekable.
        return self.readable() and self._fp.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        # Raise if the file is closed or not open for reading.
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        # Raise if the file is closed or not open for writing.
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        # Raise if seeking is impossible: wrong mode, or an unseekable
        # underlying file object.
        if not self.readable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")
        if not self._fp.seekable():
            raise io.UnsupportedOperation("The underlying file object "
                                          "does not support seeking")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._buffer:
            return True

        # Prefer input left over from the previous stream (multi-stream
        # files); otherwise read a fresh chunk of compressed data.
        if self._decompressor.unused_data:
            rawblock = self._decompressor.unused_data
        else:
            rawblock = self._fp.read(_BUFFER_SIZE)

        if not rawblock:
            if self._decompressor.eof:
                # Input exhausted and the last stream ended cleanly: we
                # now know the total uncompressed size.
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
            else:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Continue to next stream.
        if self._decompressor.eof:
            self._decompressor = BZ2Decompressor()

        self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                # Take only part of the buffer; keep the rest for later.
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        # Note: n is accepted for BufferedIOBase compatibility but does
        # not limit how much buffered data is returned.
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            # _fill_buffer() performs at most one read of the underlying
            # stream, satisfying the read1() contract.
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        # Delegate to BufferedIOBase, which implements readinto() in
        # terms of read(); the lock is held across the whole operation.
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        # Reject non-integer sizes up front (mirrors operator.index()).
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        # Reject non-integer sizes up front (mirrors operator.index()).
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            # _pos tracks the uncompressed stream, not bytes on disk.
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                # Backwards seek: restart decompression from the top.
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos
393
394
def compress(data, compresslevel=9):
    """Compress a block of data in one shot.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    # A one-shot compression is simply one compress() call followed by
    # a flush() of whatever the compressor still holds.
    parts = [compressor.compress(data), compressor.flush()]
    return b"".join(parts)
404
405
def decompress(data):
    """Decompress a block of data in one shot.

    The input may be the concatenation of multiple compressed streams;
    raises ValueError if it does not end with a complete stream.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if not data:
        return b""

    chunks = []
    while True:
        decompressor = BZ2Decompressor()
        chunks.append(decompressor.decompress(data))
        if not decompressor.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        data = decompressor.unused_data
        if not data:
            # All input consumed; no further streams follow.
            return b"".join(chunks)
        # Leftover input must be the start of another compressed stream;
        # loop around with a fresh decompressor.