blob: a2c588e7487f3d98d442c7d7c49ae5dc312f55c8 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""
6
Nadeem Vawdaaf518c12012-06-04 23:32:38 +02007__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
8 "open", "compress", "decompress"]
Antoine Pitrou37dc5f82011-04-03 17:05:46 +02009
10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020012from builtins import open as _builtin_open
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020013import io
Berker Peksag8bdd4482016-10-02 20:07:06 +030014import os
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020015import _compression
Nadeem Vawda72750a82012-01-18 01:57:14 +020016
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020017from _bz2 import BZ2Compressor, BZ2Decompressor
18
19
# Internal state codes stored in BZ2File._mode. They record whether the
# file object is closed, open for reading, or open for writing.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3
24
Antoine Pitrou37dc5f82011-04-03 17:05:46 +020025
class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is outside 1..9 or mode is not
        one of the accepted strings, and TypeError if filename is neither
        a path-like value nor an object with a read or write attribute.
        """
        # Start out in a well-defined "closed" state so that close() is
        # safe even if the constructor fails part-way through.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalize the mode to the binary form handed to the builtin open().
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        # Every writing mode needs a compressor; create it before the
        # underlying file is opened.
        if mode_code == _MODE_WRITE:
            self._compressor = BZ2Compressor(compresslevel)

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: use it, but never close it.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Reset the state even if flushing or closing failed.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED
                self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek()
        # always returns at least one byte (except at EOF), independent
        # of the value of n
        return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        self._check_can_read()
        return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if not isinstance(size, int):
            # Accept any object usable as an integer index.
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readline(size)

    def __iter__(self):
        """Return the iterator of the underlying read buffer."""
        self._check_can_read()
        return self._buffer.__iter__()

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        if not isinstance(size, int):
            # Accept any object usable as an integer index.
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        data may be bytes, bytearray, or any other object supporting the
        buffer protocol.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # Accept any data that supports the buffer protocol. len()
            # would count items rather than bytes for multi-byte item
            # types (e.g. array.array('I')), so measure via memoryview.
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        The work is delegated to the writelines() implementation inherited
        from _compression.BaseStream, and its result is returned as-is.
        """
        return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        # In write mode the position is the running uncompressed byte count.
        return self._pos
Antoine Pitrou37dc5f82011-04-03 17:05:46 +0200265
266
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel=compresslevel).
    In this case, the encoding, errors and newline arguments must not
    be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if encoding,
    errors or newline is given in binary mode.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes; strip the text marker.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if not text_mode:
        return binary_file

    try:
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except BaseException:
        # Building the text layer failed (e.g. unknown encoding). Close
        # the BZ2File now so the file it opened is not left dangling
        # until garbage collection, then propagate the original error.
        binary_file.close()
        raise
307
308
def compress(data, compresslevel=9):
    """Compress a block of data in one shot and return the result.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() finishes the stream and returns whatever is still buffered.
    trailer = compressor.flush()
    return body + trailer
318
319
def decompress(data):
    """Decompress a block of data, which may hold several bzip2 streams.

    The streams are decompressed one after another and their contents
    concatenated. Trailing data that is not a valid bzip2 stream is
    ignored once at least one stream has been decoded.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    chunks = []
    remaining = data
    while remaining:
        stream = BZ2Decompressor()
        try:
            chunk = stream.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        chunks.append(chunk)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the finished stream is tried as the next stream.
        remaining = stream.unused_data
    return b"".join(chunks)