"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

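# Quick usage sketch (illustrative only; "example.bz2" is a hypothetical
# filename and the payload is arbitrary):
#
#     with BZ2File("example.bz2", "w") as f:      # file interface
#         f.write(b"payload")
#
#     blob = compress(b"payload")                 # one-shot functions
#     assert decompress(blob) == b"payload"
#
#     comp = BZ2Compressor()                      # incremental interface
#     stream = comp.compress(b"payload") + comp.flush()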
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
           "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import io
import warnings

try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename=None, mode="r", buffering=None,
                 compresslevel=9, fileobj=None):
        """Open a bzip2-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be 'r' for reading (default), 'w' for writing, or 'a'
        for appending.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            self._fp = open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

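    # Construction sketch (illustrative only; "data.bz2" and binary_stream
    # are hypothetical):
    #
    #     f = BZ2File("data.bz2")                        # open a named file
    #     f = BZ2File(fileobj=binary_stream, mode="w")   # wrap an existing file object
    #
    # Exactly one of filename and fileobj must be given, as enforced above.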
    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.seekable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._buffer:
            return True

        if self._decompressor.unused_data:
            rawblock = self._decompressor.unused_data
        else:
            rawblock = self._fp.read(_BUFFER_SIZE)

        if not rawblock:
            if self._decompressor.eof:
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
            else:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Continue to next stream.
        if self._decompressor.eof:
            self._decompressor = BZ2Decompressor()

        self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos

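    # Note on the emulation above: a forward seek decompresses and discards
    # data up to the target position, and a backward seek rewinds to the
    # start of the stream and re-reads from there. Seeking relative to EOF
    # may first decompress the whole file just to learn its size.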
    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()


def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if len(data) == 0:
        return b""

    results = []
    while True:
        decomp = BZ2Decompressor()
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        if not decomp.unused_data:
            return b"".join(results)
        # There is unused data left over. Proceed to next stream.
        data = decomp.unused_data
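

# A minimal, self-contained usage sketch (not part of the module's public
# API; the payload and the in-memory stream are illustrative only). It
# round-trips data through the one-shot functions and through BZ2File
# wrapping an io.BytesIO object.
if __name__ == "__main__":
    payload = b"bz2 round-trip example\n" * 100

    # One-shot (de)compression.
    assert decompress(compress(payload)) == payload

    # File interface, using an in-memory stream as the fileobj.
    buf = io.BytesIO()
    with BZ2File(fileobj=buf, mode="w") as f:
        f.write(payload)
    buf.seek(0)
    with BZ2File(fileobj=buf, mode="r") as f:
        assert f.read() == payload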