"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
           "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import io
import threading
import warnings

from _bz2 import BZ2Compressor, BZ2Decompressor


_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_READ_EOF = 2
_MODE_WRITE = 3

_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename=None, mode="r", buffering=None,
                 compresslevel=9, fileobj=None):
        """Open a bzip2-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be 'r' for reading (default), 'w' for writing, or 'a'
        for appending.

        buffering is ignored. Its use is deprecated.

        If mode is 'w' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = threading.RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0
        self._size = -1

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            self._fp = open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable()

    def readable(self):
        """Return whether the file was opened for reading."""
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.seekable():
            self._check_not_closed()
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._buffer:
            return True

        if self._decompressor.unused_data:
            rawblock = self._decompressor.unused_data
        else:
            rawblock = self._fp.read(_BUFFER_SIZE)

        if not rawblock:
            if self._decompressor.eof:
                self._mode = _MODE_READ_EOF
                self._size = self._pos
                return False
            else:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Continue to next stream.
        if self._decompressor.eof:
            self._decompressor = BZ2Decompressor()

        self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos

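# A minimal usage sketch of BZ2File, kept in comments so it never runs on
# import.  The file name "example.bz2" is only illustrative; any writable
# path would do.  Writing and reading back a short payload:
#
#     with BZ2File("example.bz2", "w", compresslevel=9) as f:
#         f.write(b"Hello, bzip2!")
#     with BZ2File("example.bz2", "r") as f:
#         assert f.read() == b"Hello, bzip2!"

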
def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()

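# A rough sketch, in comments, of the incremental classes that the one-shot
# helpers build on: chunks of any size may be fed to compress(), and flush()
# must be called to obtain the final piece of compressed output.
#
#     comp = BZ2Compressor(9)
#     out = comp.compress(b"chunk one ") + comp.compress(b"chunk two")
#     out += comp.flush()
#     decomp = BZ2Decompressor()
#     assert decomp.decompress(out) == b"chunk one chunk two"

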
def decompress(data):
    """Decompress a block of data.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    if len(data) == 0:
        return b""

    results = []
    while True:
        decomp = BZ2Decompressor()
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        if not decomp.unused_data:
            return b"".join(results)
        # There is unused data left over. Proceed to next stream.
        data = decomp.unused_data
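

# A small self-check sketch: it exercises the one-shot helpers, including
# decompress()'s handling of concatenated streams, and runs only when this
# file is executed directly (importing the module is unaffected).
if __name__ == "__main__":
    _payload = b"The quick brown fox jumps over the lazy dog. " * 100
    _blob = compress(_payload)
    assert decompress(_blob) == _payload
    # Two independently compressed streams, concatenated back to back,
    # decompress to one continuous payload.
    assert decompress(_blob + _blob) == _payload + _payload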