blob: ae59407c1ac2d365619b606debfcae4819758ca6 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
6
# Names exported by a star-import of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
           "decompress"]

# Author/maintainer contact recorded for this module.
__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
12import io
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import warnings
14
Nadeem Vawda72750a82012-01-18 01:57:14 +020015try:
16 from threading import RLock
17except ImportError:
18 from dummy_threading import RLock
19
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020020from _bz2 import BZ2Compressor, BZ2Decompressor
21
22
23_MODE_CLOSED = 0
24_MODE_READ = 1
25_MODE_READ_EOF = 2
26_MODE_WRITE = 3
27
28_BUFFER_SIZE = 8192
29
30
31class BZ2File(io.BufferedIOBase):
32
33 """A file object providing transparent bzip2 (de)compression.
34
35 A BZ2File can act as a wrapper for an existing file object, or refer
36 directly to a named file on disk.
37
38 Note that BZ2File provides a *binary* file interface - data read is
39 returned as bytes, and data to be written should be given as bytes.
40 """
41
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020042 def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020043 """Open a bzip2-compressed file.
44
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020045 If filename is a str or bytes object, is gives the name of the file to
46 be opened. Otherwise, it should be a file object, which will be used to
47 read or write the compressed data.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020048
Nadeem Vawdacac89092012-02-04 13:08:11 +020049 mode can be 'r' for reading (default), 'w' for (over)writing, or
50 'a' for appending.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020051
52 buffering is ignored. Its use is deprecated.
53
Nadeem Vawdacac89092012-02-04 13:08:11 +020054 If mode is 'w' or 'a', compresslevel can be a number between 1
55 and 9 specifying the level of compression: 1 produces the least
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020056 compression, and 9 (default) produces the most compression.
Nadeem Vawdacac89092012-02-04 13:08:11 +020057
58 If mode is 'r', the input file may be the concatenation of
59 multiple compressed streams.
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020060 """
61 # This lock must be recursive, so that BufferedIOBase's
62 # readline(), readlines() and writelines() don't deadlock.
Nadeem Vawda72750a82012-01-18 01:57:14 +020063 self._lock = RLock()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020064 self._fp = None
65 self._closefp = False
66 self._mode = _MODE_CLOSED
67 self._pos = 0
68 self._size = -1
69
70 if buffering is not None:
71 warnings.warn("Use of 'buffering' argument is deprecated",
72 DeprecationWarning)
73
74 if not (1 <= compresslevel <= 9):
75 raise ValueError("compresslevel must be between 1 and 9")
76
77 if mode in ("", "r", "rb"):
78 mode = "rb"
79 mode_code = _MODE_READ
80 self._decompressor = BZ2Decompressor()
81 self._buffer = None
82 elif mode in ("w", "wb"):
83 mode = "wb"
84 mode_code = _MODE_WRITE
Nadeem Vawda249ab5e2011-09-11 22:38:11 +020085 self._compressor = BZ2Compressor(compresslevel)
Nadeem Vawda55b43382011-05-27 01:52:15 +020086 elif mode in ("a", "ab"):
87 mode = "ab"
88 mode_code = _MODE_WRITE
Nadeem Vawda249ab5e2011-09-11 22:38:11 +020089 self._compressor = BZ2Compressor(compresslevel)
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020090 else:
91 raise ValueError("Invalid mode: {!r}".format(mode))
92
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020093 if isinstance(filename, (str, bytes)):
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020094 self._fp = open(filename, mode)
95 self._closefp = True
96 self._mode = mode_code
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +020097 elif hasattr(filename, "read") or hasattr(filename, "write"):
98 self._fp = filename
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020099 self._mode = mode_code
100 else:
Nadeem Vawdaaebcdba2012-06-04 23:31:20 +0200101 raise TypeError("filename must be a str or bytes object, or a file")
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200102
103 def close(self):
104 """Flush and close the file.
105
106 May be called more than once without error. Once the file is
107 closed, any other operation on it will raise a ValueError.
108 """
109 with self._lock:
110 if self._mode == _MODE_CLOSED:
111 return
112 try:
113 if self._mode in (_MODE_READ, _MODE_READ_EOF):
114 self._decompressor = None
115 elif self._mode == _MODE_WRITE:
116 self._fp.write(self._compressor.flush())
117 self._compressor = None
118 finally:
Antoine Pitrou24ce3862011-04-03 17:08:49 +0200119 try:
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200120 if self._closefp:
121 self._fp.close()
122 finally:
123 self._fp = None
124 self._closefp = False
125 self._mode = _MODE_CLOSED
126 self._buffer = None
127
128 @property
129 def closed(self):
130 """True if this file is closed."""
131 return self._mode == _MODE_CLOSED
132
133 def fileno(self):
134 """Return the file descriptor for the underlying file."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200135 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200136 return self._fp.fileno()
137
138 def seekable(self):
139 """Return whether the file supports seeking."""
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200140 return self.readable() and self._fp.seekable()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200141
142 def readable(self):
143 """Return whether the file was opened for reading."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200144 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200145 return self._mode in (_MODE_READ, _MODE_READ_EOF)
146
147 def writable(self):
148 """Return whether the file was opened for writing."""
Nadeem Vawda44ae4a22011-11-30 17:39:30 +0200149 self._check_not_closed()
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200150 return self._mode == _MODE_WRITE
151
152 # Mode-checking helper functions.
153
154 def _check_not_closed(self):
155 if self.closed:
156 raise ValueError("I/O operation on closed file")
157
158 def _check_can_read(self):
159 if not self.readable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200160 raise io.UnsupportedOperation("File not open for reading")
161
162 def _check_can_write(self):
163 if not self.writable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200164 raise io.UnsupportedOperation("File not open for writing")
165
166 def _check_can_seek(self):
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200167 if not self.readable():
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200168 raise io.UnsupportedOperation("Seeking is only supported "
Nadeem Vawdaf1a1af22011-05-25 00:32:08 +0200169 "on files open for reading")
Nadeem Vawdaae557d72012-02-12 01:51:38 +0200170 if not self._fp.seekable():
171 raise io.UnsupportedOperation("The underlying file object "
172 "does not support seeking")
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200173
174 # Fill the readahead buffer if it is empty. Returns False on EOF.
175 def _fill_buffer(self):
176 if self._buffer:
177 return True
Nadeem Vawda55b43382011-05-27 01:52:15 +0200178
179 if self._decompressor.unused_data:
180 rawblock = self._decompressor.unused_data
181 else:
182 rawblock = self._fp.read(_BUFFER_SIZE)
183
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200184 if not rawblock:
Nadeem Vawda55b43382011-05-27 01:52:15 +0200185 if self._decompressor.eof:
186 self._mode = _MODE_READ_EOF
187 self._size = self._pos
188 return False
189 else:
190 raise EOFError("Compressed file ended before the "
191 "end-of-stream marker was reached")
192
193 # Continue to next stream.
194 if self._decompressor.eof:
195 self._decompressor = BZ2Decompressor()
196
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200197 self._buffer = self._decompressor.decompress(rawblock)
198 return True
199
200 # Read data until EOF.
201 # If return_data is false, consume the data without returning it.
202 def _read_all(self, return_data=True):
203 blocks = []
204 while self._fill_buffer():
205 if return_data:
206 blocks.append(self._buffer)
207 self._pos += len(self._buffer)
208 self._buffer = None
209 if return_data:
210 return b"".join(blocks)
211
212 # Read a block of up to n bytes.
213 # If return_data is false, consume the data without returning it.
214 def _read_block(self, n, return_data=True):
215 blocks = []
216 while n > 0 and self._fill_buffer():
217 if n < len(self._buffer):
218 data = self._buffer[:n]
219 self._buffer = self._buffer[n:]
220 else:
221 data = self._buffer
222 self._buffer = None
223 if return_data:
224 blocks.append(data)
225 self._pos += len(data)
226 n -= len(data)
227 if return_data:
228 return b"".join(blocks)
229
230 def peek(self, n=0):
231 """Return buffered data without advancing the file position.
232
233 Always returns at least one byte of data, unless at EOF.
234 The exact number of bytes returned is unspecified.
235 """
236 with self._lock:
237 self._check_can_read()
238 if self._mode == _MODE_READ_EOF or not self._fill_buffer():
239 return b""
240 return self._buffer
241
242 def read(self, size=-1):
243 """Read up to size uncompressed bytes from the file.
244
245 If size is negative or omitted, read until EOF is reached.
246 Returns b'' if the file is already at EOF.
247 """
248 with self._lock:
249 self._check_can_read()
250 if self._mode == _MODE_READ_EOF or size == 0:
251 return b""
252 elif size < 0:
253 return self._read_all()
254 else:
255 return self._read_block(size)
256
257 def read1(self, size=-1):
258 """Read up to size uncompressed bytes with at most one read
259 from the underlying stream.
260
261 Returns b'' if the file is at EOF.
262 """
263 with self._lock:
264 self._check_can_read()
265 if (size == 0 or self._mode == _MODE_READ_EOF or
266 not self._fill_buffer()):
267 return b""
268 if 0 < size < len(self._buffer):
269 data = self._buffer[:size]
270 self._buffer = self._buffer[size:]
271 else:
272 data = self._buffer
273 self._buffer = None
274 self._pos += len(data)
275 return data
276
277 def readinto(self, b):
278 """Read up to len(b) bytes into b.
Antoine Pitrou24ce3862011-04-03 17:08:49 +0200279
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200280 Returns the number of bytes read (0 for EOF).
281 """
282 with self._lock:
283 return io.BufferedIOBase.readinto(self, b)
284
285 def readline(self, size=-1):
286 """Read a line of uncompressed bytes from the file.
287
288 The terminating newline (if present) is retained. If size is
289 non-negative, no more than size bytes will be read (in which
290 case the line may be incomplete). Returns b'' if already at EOF.
291 """
292 if not hasattr(size, "__index__"):
293 raise TypeError("Integer argument expected")
294 size = size.__index__()
295 with self._lock:
296 return io.BufferedIOBase.readline(self, size)
297
298 def readlines(self, size=-1):
299 """Read a list of lines of uncompressed bytes from the file.
300
301 size can be specified to control the number of lines read: no
302 further lines will be read once the total size of the lines read
303 so far equals or exceeds size.
304 """
305 if not hasattr(size, "__index__"):
306 raise TypeError("Integer argument expected")
307 size = size.__index__()
308 with self._lock:
309 return io.BufferedIOBase.readlines(self, size)
310
311 def write(self, data):
312 """Write a byte string to the file.
313
314 Returns the number of uncompressed bytes written, which is
315 always len(data). Note that due to buffering, the file on disk
316 may not reflect the data written until close() is called.
317 """
318 with self._lock:
319 self._check_can_write()
320 compressed = self._compressor.compress(data)
321 self._fp.write(compressed)
322 self._pos += len(data)
323 return len(data)
324
325 def writelines(self, seq):
326 """Write a sequence of byte strings to the file.
327
328 Returns the number of uncompressed bytes written.
329 seq can be any iterable yielding byte strings.
330
331 Line separators are not added between the written byte strings.
332 """
333 with self._lock:
334 return io.BufferedIOBase.writelines(self, seq)
335
336 # Rewind the file to the beginning of the data stream.
337 def _rewind(self):
338 self._fp.seek(0, 0)
339 self._mode = _MODE_READ
340 self._pos = 0
341 self._decompressor = BZ2Decompressor()
342 self._buffer = None
343
344 def seek(self, offset, whence=0):
345 """Change the file position.
346
347 The new position is specified by offset, relative to the
348 position indicated by whence. Values for whence are:
349
350 0: start of stream (default); offset must not be negative
351 1: current stream position
352 2: end of stream; offset must not be positive
353
354 Returns the new file position.
355
356 Note that seeking is emulated, so depending on the parameters,
357 this operation may be extremely slow.
358 """
359 with self._lock:
360 self._check_can_seek()
361
362 # Recalculate offset as an absolute file position.
363 if whence == 0:
364 pass
365 elif whence == 1:
366 offset = self._pos + offset
367 elif whence == 2:
368 # Seeking relative to EOF - we need to know the file's size.
369 if self._size < 0:
370 self._read_all(return_data=False)
371 offset = self._size + offset
372 else:
373 raise ValueError("Invalid value for whence: {}".format(whence))
374
375 # Make it so that offset is the number of bytes to skip forward.
376 if offset < self._pos:
377 self._rewind()
378 else:
379 offset -= self._pos
380
381 # Read and discard data until we reach the desired position.
382 if self._mode != _MODE_READ_EOF:
383 self._read_block(offset, return_data=False)
384
385 return self._pos
386
387 def tell(self):
388 """Return the current file position."""
389 with self._lock:
390 self._check_not_closed()
391 return self._pos
392
393
def compress(data, compresslevel=9):
    """Compress data in a single call and return the bzip2 stream.

    compresslevel, if supplied, must be a number between 1 and 9.

    Use a BZ2Compressor object instead when the input arrives
    incrementally.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() emits whatever the compressor still holds and ends the stream.
    trailer = compressor.flush()
    return body + trailer
403
404
def decompress(data):
    """Decompress data in a single call and return the original bytes.

    data may hold several bzip2 streams back to back; their contents
    are concatenated. Raises ValueError if the data stops before an
    end-of-stream marker.

    Use a BZ2Decompressor object instead when the input arrives
    incrementally.
    """
    if len(data) == 0:
        return b""

    pieces = []
    remaining = data
    while True:
        # A decompressor handles exactly one stream, so use a new one each time.
        stream = BZ2Decompressor()
        pieces.append(stream.decompress(remaining))
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Bytes left after the end-of-stream marker belong to the next stream.
        remaining = stream.unused_data
        if not remaining:
            return b"".join(pieces)