blob: 9506b4ad3b7c8074fa60d43009119448948e188d [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", "compress",
           "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

import io
import threading
import warnings

# The incremental compressor/decompressor classes are implemented in C.
from _bz2 import BZ2Compressor, BZ2Decompressor
17
18
# Internal states for BZ2File._mode.
_MODE_CLOSED = 0    # File has been closed; all operations raise ValueError.
_MODE_READ = 1      # Open for reading; end of stream not yet reached.
_MODE_READ_EOF = 2  # Open for reading; end of stream has been reached.
_MODE_WRITE = 3     # Open for writing.

# Number of compressed bytes read from the underlying file per buffer fill.
_BUFFER_SIZE = 8192


class BZ2File(io.BufferedIOBase):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename=None, mode="r", buffering=None,
                 compresslevel=9, fileobj=None):
        """Open a bzip2-compressed file.

        If filename is given, open the named file. Otherwise, operate on
        the file object given by fileobj. Exactly one of these two
        parameters should be provided.

        mode can be 'r' for reading (default), or 'w' for writing.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', compresslevel can be a number between 1 and 9
        specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        Raises ValueError if compresslevel is out of range, if mode is
        invalid, or unless exactly one of filename and fileobj is given.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # readline(), readlines() and writelines() don't deadlock.
        self._lock = threading.RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED
        self._pos = 0     # Position in the *uncompressed* data stream.
        self._size = -1   # Uncompressed size; -1 until EOF has been seen.

        if buffering is not None:
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
            self._decompressor = BZ2Decompressor()
            self._buffer = None   # Readahead buffer of decompressed bytes.
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            # Bug fix: compresslevel was previously validated but never
            # passed to the compressor, so writes always used level 9.
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if filename is not None and fileobj is None:
            self._fp = open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif fileobj is not None and filename is None:
            self._fp = fileobj
            self._mode = mode_code
        else:
            raise ValueError("Must give exactly one of filename and fileobj")

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode in (_MODE_READ, _MODE_READ_EOF):
                    self._decompressor = None
                elif self._mode == _MODE_WRITE:
                    # Flush any data still buffered inside the compressor.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Even if flushing fails, try to close the underlying
                # file, and always reset our state afterwards.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Seeking is emulated by rewinding and re-reading, so it is only
        # possible on files opened for reading.
        return self.readable()

    def readable(self):
        """Return whether the file was opened for reading."""
        return self._mode in (_MODE_READ, _MODE_READ_EOF)

    def writable(self):
        """Return whether the file was opened for writing."""
        return self._mode == _MODE_WRITE

    # Mode-checking helper functions.

    def _check_not_closed(self):
        # Raise ValueError if the file has been closed.
        if self.closed:
            raise ValueError("I/O operation on closed file")

    def _check_can_read(self):
        if not self.readable():
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for reading")

    def _check_can_write(self):
        if not self.writable():
            self._check_not_closed()
            raise io.UnsupportedOperation("File not open for writing")

    def _check_can_seek(self):
        if not self.seekable():
            self._check_not_closed()
            # Bug fix: message previously read "files opening for reading".
            raise io.UnsupportedOperation("Seeking is only supported "
                                          "on files open for reading")

    # Fill the readahead buffer if it is empty. Returns False on EOF.
    def _fill_buffer(self):
        if self._buffer:
            return True
        if self._decompressor.eof:
            self._mode = _MODE_READ_EOF
            self._size = self._pos
            return False
        rawblock = self._fp.read(_BUFFER_SIZE)
        if not rawblock:
            # The underlying file ran out of data before the bzip2
            # end-of-stream marker appeared - the file is truncated.
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        self._buffer = self._decompressor.decompress(rawblock)
        return True

    # Read data until EOF.
    # If return_data is false, consume the data without returning it.
    def _read_all(self, return_data=True):
        blocks = []
        while self._fill_buffer():
            if return_data:
                blocks.append(self._buffer)
            self._pos += len(self._buffer)
            self._buffer = None
        if return_data:
            return b"".join(blocks)

    # Read a block of up to n bytes.
    # If return_data is false, consume the data without returning it.
    def _read_block(self, n, return_data=True):
        blocks = []
        while n > 0 and self._fill_buffer():
            if n < len(self._buffer):
                data = self._buffer[:n]
                self._buffer = self._buffer[n:]
            else:
                data = self._buffer
                self._buffer = None
            if return_data:
                blocks.append(data)
            self._pos += len(data)
            n -= len(data)
        if return_data:
            return b"".join(blocks)

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or not self._fill_buffer():
                return b""
            return self._buffer

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            if self._mode == _MODE_READ_EOF or size == 0:
                return b""
            elif size < 0:
                return self._read_all()
            else:
                return self._read_block(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes with at most one read
        from the underlying stream.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if (size == 0 or self._mode == _MODE_READ_EOF or
                not self._fill_buffer()):
                return b""
            if 0 < size < len(self._buffer):
                data = self._buffer[:size]
                self._buffer = self._buffer[size:]
            else:
                data = self._buffer
                self._buffer = None
            self._pos += len(data)
            return data

    def readinto(self, b):
        """Read up to len(b) bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            return io.BufferedIOBase.readinto(self, b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readline(self, size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not hasattr(size, "__index__"):
            raise TypeError("Integer argument expected")
        size = size.__index__()
        with self._lock:
            return io.BufferedIOBase.readlines(self, size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return io.BufferedIOBase.writelines(self, seq)

    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0, 0)
        self._mode = _MODE_READ
        self._pos = 0
        self._decompressor = BZ2Decompressor()
        self._buffer = None

    def seek(self, offset, whence=0):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()

            # Recalculate offset as an absolute file position.
            if whence == 0:
                pass
            elif whence == 1:
                offset = self._pos + offset
            elif whence == 2:
                # Seeking relative to EOF - we need to know the file's size.
                if self._size < 0:
                    self._read_all(return_data=False)
                offset = self._size + offset
            else:
                raise ValueError("Invalid value for whence: {}".format(whence))

            # Make it so that offset is the number of bytes to skip forward.
            if offset < self._pos:
                self._rewind()
            else:
                offset -= self._pos

            # Read and discard data until we reach the desired position.
            if self._mode != _MODE_READ_EOF:
                self._read_block(offset, return_data=False)

            return self._pos

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            return self._pos
367
368
def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    # Feed the whole block in one shot, then flush the trailing bytes.
    chunks = [compressor.compress(data), compressor.flush()]
    return b"".join(chunks)
378
379
def decompress(data):
    """Decompress a block of data.

    data must contain one complete bzip2 stream; raises ValueError if
    the stream ends before the end-of-stream marker is reached.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    # Degenerate case: empty input decompresses to nothing rather than
    # being treated as a truncated stream.
    if not data:
        return b""
    decomp = BZ2Decompressor()
    result = decomp.decompress(data)
    if not decomp.eof:
        raise ValueError("Compressed data ended before the "
                         "end-of-stream marker was reached")
    return result