"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
           "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import io
import warnings

try:
    from threading import RLock
except ImportError:
    from dummy_threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename=None, mode="r", buffering=None,
                 compresslevel=9, fileobj=None):
        """Open a bzip2-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be 'r' for reading (default), 'w' for (over)writing, or
        'a' for appending.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            self._fp = open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")
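
    # Note on modes (illustrative sketch; "log.bz2" is a hypothetical file
    # name): mode 'a' appends a new compressed stream to the file, and mode
    # 'r' transparently reads concatenated streams back-to-back, so
    #
    #     with BZ2File("log.bz2", "w") as f:
    #         f.write(b"first")
    #     with BZ2File("log.bz2", "a") as f:
    #         f.write(b"second")
    #     with BZ2File("log.bz2", "r") as f:
    #         assert f.read() == b"firstsecond"
    #
    # reads back the concatenation of everything written.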

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.seekable():
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._buffer:
            return True

        if self._decompressor.unused_data:
            rawblock = self._decompressor.unused_data
        else:
            rawblock = self._fp.read(_BUFFER_SIZE)

        if not rawblock:
            if self._decompressor.eof:
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
            else:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Continue to next stream.
        if self._decompressor.eof:
            self._decompressor = BZ2Decompressor()

        self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos
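
    # Seeking is emulated (illustrative sketch): a backwards seek rewinds to
    # the start of the compressed data and re-decompresses forward, so for an
    # open readable BZ2File f, calls like
    #
    #     f.seek(0)          # back to the start of the uncompressed data
    #     f.seek(-10, 2)     # 10 bytes before the uncompressed end
    #
    # work correctly but may be slow on large files.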

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()
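
# Incremental alternative to compress() (illustrative sketch): feed chunks to
# a BZ2Compressor and concatenate its output, finishing with flush().
#
#     comp = BZ2Compressor(9)
#     parts = [comp.compress(chunk) for chunk in (b"spam", b"eggs")]
#     parts.append(comp.flush())
#     assert decompress(b"".join(parts)) == b"spameggs"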


def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if len(data) == 0:
        return b""

    results = []
    while True:
        decomp = BZ2Decompressor()
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        if not decomp.unused_data:
            return b"".join(results)
        # There is unused data left over. Proceed to next stream.
        data = decomp.unused_data
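
# Multi-stream behaviour of decompress() (illustrative sketch): concatenated
# bzip2 streams decompress to the concatenation of their payloads.
#
#     blob = compress(b"spam") + compress(b"eggs")
#     assert decompress(blob) == b"spameggs"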