blob: bc78c544851203bf0c25592e642b6188488d25e9 [file] [log] [blame]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02001"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
# Public names of this module: the file class, the incremental
# (de)compressor types, and the one-shot helper functions.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

# Module author metadata.
__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020014import warnings
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020015import _compression
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020016
Nadeem Vawda72750a82012-01-18 01:57:14 +020017try:
18 from threading import RLock
Brett Cannoncd171c82013-07-04 17:43:24 -040019except ImportError:
Nadeem Vawda72750a82012-01-18 01:57:14 +020020 from dummy_threading import RLock
21
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020022from _bz2 import BZ2Compressor, BZ2Decompressor
23
24
# State codes stored in BZ2File._mode. They record whether the file is
# closed, open for reading, or open for writing.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str or bytes object, it gives the name
        of the file to be opened. Otherwise, it should be a file object,
        which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel or mode is invalid, and
        TypeError if filename is neither a str/bytes name nor an object
        with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        # Start out in the closed state so that close() is a safe no-op
        # if the constructor fails part-way through.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not None:
            # stacklevel=2 attributes the warning to the caller's line
            # rather than to this module.
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning, stacklevel=2)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "x", "xb", "a", "ab"):
            # The write flavours differ only in how the underlying file
            # is opened; normalise to the binary spelling of the mode.
            mode = mode[0] + "b"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes)):
            # We opened the file ourselves, so we must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it stays owned by the caller.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str or bytes object, or a file")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding,
                    # including the end-of-stream marker.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Always release the underlying file and reset the state,
                # even if flushing or closing raised.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an integer and does not
        provide __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an integer and does not
        provide __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the size of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # len() counts items, not bytes, for buffers whose item
                # size is not 1 (e.g. array('I')); measure via memoryview
                # so the return value and tell() are in bytes.
                data = memoryview(data)
                length = data.nbytes

            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
286
287
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes
    object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode mixes "t" and "b", or if a text-only
    argument is given in binary mode.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the text flag.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if not text_mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except BaseException:
        # The wrapper could not be built (e.g. unknown encoding). Close
        # the BZ2File now instead of leaving its file handle open until
        # garbage collection, then let the original error propagate.
        binary_file.close()
        raise
325 return binary_file
326
327
def compress(data, compresslevel=9):
    """Compress data in one shot and return the complete bzip2 stream.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() emits the buffered remainder plus the end-of-stream marker.
    trailer = compressor.flush()
    return body + trailer
337
338
def decompress(data):
    """Decompress a block of data and return the uncompressed bytes.

    data may hold several bzip2 streams back to back; their contents
    are concatenated. Invalid data following at least one complete
    stream is ignored.

    Raises OSError if data does not start with a valid stream, and
    ValueError if a stream ends before its end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    pieces = []
    pending = data
    while pending:
        decoder = BZ2Decompressor()
        try:
            piece = decoder.decompress(pending)
        except OSError:
            if not pieces:
                # Nothing decoded yet: the input itself is invalid.
                raise
            # Trailing bytes after a good stream are not bzip2; drop them.
            break
        else:
            pieces.append(piece)
        if not decoder.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Anything past the end of this stream may be another stream.
        pending = decoder.unused_data
    return b"".join(pieces)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100359 return b"".join(results)