blob: 21e8ff49c67b38882827d04475186593787d0dfc [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
6
# Public names exported by a star-import of this module.
__all__ = [
    "BZ2File",
    "BZ2Compressor",
    "BZ2Decompressor",
    "open",
    "compress",
    "decompress",
]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Berker Peksag8bdd4482016-10-02 20:07:06 +030014import os
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020015import warnings
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020016import _compression
Antoine Pitroua6a4dc82017-09-07 18:56:24 +020017from threading import RLock
Nadeem Vawda72750a82012-01-18 01:57:14 +020018
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020019from _bz2 import BZ2Compressor, BZ2Decompressor
20
21
# State codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3

# Unique default for BZ2File's deprecated 'buffering' argument; comparing
# against it by identity reveals whether the caller passed any value at all.
_sentinel = object()
28
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020029
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.

    The read, write, seek, tell and close operations run while holding
    an internal re-entrant lock.
    """

    def __init__(self, filename, mode="r", buffering=_sentinel, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.
        A file object passed in by the caller is not closed by close().

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'. An empty
        string is treated like 'r'.

        buffering is ignored since Python 3.0. Its use is deprecated;
        passing any value for it emits a DeprecationWarning.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.
        The range check on compresslevel is applied in every mode.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is outside 1..9 or if mode is
        not one of the values above, and TypeError if filename is neither
        a path nor an object with a read() or write() attribute.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        # Begin in the closed state: with _mode == _MODE_CLOSED, close()
        # returns immediately, so it stays harmless if anything below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not _sentinel:
            warnings.warn("Use of 'buffering' argument is deprecated and ignored "
                          "since Python 3.0.",
                          DeprecationWarning,
                          stacklevel=2)

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalize mode to the explicit binary spelling that is handed to
        # the builtin open(), and pick the matching internal state code.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the file ourselves, so close() must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: use it, but leave closing to them.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            # NOTE(review): trailing_error=OSError presumably names the
            # exception that marks invalid trailing data - confirm against
            # _compression.DecompressReader.
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Running count of uncompressed bytes written; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.

        In write mode the compressor's remaining output is written to
        the underlying file first. The underlying file is closed only if
        this object opened it from a filename.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Release the underlying file even if flushing failed ...
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    # ... and always end up in the closed state, even if
                    # closing the underlying file raised.
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking.

        Only a file opened for reading can be seekable; the answer is
        then taken from the internal buffered reader.
        """
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                # A negative size means "one buffer's worth".
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        size may be an int or any object with an __index__() method;
        anything else raises TypeError.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        size may be an int or any object with an __index__() method;
        anything else raises TypeError.
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always len(data). Note that due to buffering, the file on disk
        may not reflect the data written until close() is called.
        """
        with self._lock:
            self._check_can_write()
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            # The position is tracked in uncompressed bytes.
            self._pos += len(data)
            return len(data)

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        The result is whatever the inherited writelines() returns (None
        for the standard io base classes); no byte count is reported.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position.

        In read mode this is the position reported by the internal
        buffered reader; in write mode it is the number of uncompressed
        bytes written so far.
        """
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos
287
288
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename is either a real filename (a str, bytes, or PathLike
    object) or an already-open file object to read from or write to.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary access, or "rt", "wt", "xt" or "at" for text access. It
    defaults to "rb"; compresslevel defaults to 9.

    In binary mode the result is the same as calling
    BZ2File(filename, mode, compresslevel), and encoding, errors and
    newline must all be left as None.

    In text mode the BZ2File is wrapped in an io.TextIOWrapper built
    with the given encoding, error handling behavior, and line
    ending(s).

    Raises ValueError if mode contains both "t" and "b", or if a
    text-only argument is supplied in binary mode.
    """
    text_mode = "t" in mode

    if text_mode and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not text_mode:
        # The text-layer arguments are meaningless without a text wrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the "t" flag.
    raw_stream = BZ2File(filename, mode.replace("t", ""),
                         compresslevel=compresslevel)

    if not text_mode:
        return raw_stream
    return io.TextIOWrapper(raw_stream, encoding, errors, newline)
328
329
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.

    compresslevel, when supplied, must be a number between 1 and 9.

    Use a BZ2Compressor object instead for incremental compression.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() emits whatever the compressor is still holding back.
    tail = compressor.flush()
    return body + tail
339
340
def decompress(data):
    """Decompress a block of data in one shot.

    data may hold several compressed streams back to back; their
    contents are concatenated. Once at least one stream has been
    decoded, leftover bytes that fail to decode are silently ignored.

    Raises OSError if the very first stream cannot be decoded, and
    ValueError if a stream stops before its end-of-stream marker.

    Use a BZ2Decompressor object instead for incremental decompression.
    """
    pieces = []
    remaining = data
    while remaining:
        # Each stream needs a fresh decompressor.
        stream = BZ2Decompressor()
        try:
            piece = stream.decompress(remaining)
        except OSError:
            if not pieces:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        pieces.append(piece)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        remaining = stream.unused_data
    return b"".join(pieces)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100361 return b"".join(results)