blob: 43f321ae85239844096a91732ccad4f1d067b4c8 [file] [log] [blame]
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

# Public names exported by a star-import of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

from builtins import open as _builtin_open
import io
import os
import _compression

from _bz2 import BZ2Compressor, BZ2Decompressor


# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() attribute.
        """
        # Establish the "closed" state first so that close() is safe to
        # call even if the rest of the constructor raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise the mode to the binary form handed to the builtin
        # open(), and create the compressor for every writing mode.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the underlying file, so we are responsible for
            # closing it again.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: use it, but never close it.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # Always release the underlying file and reset the state,
            # even if flushing or closing raised.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED
                self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek()
        # always returns at least one byte (except at EOF), independent
        # of the value of n
        return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        self._check_can_read()
        return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an int and has no __index__().
        """
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readlines(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the size of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # Accept any object supporting the buffer protocol. len() of
            # such an object counts items, not bytes (e.g. array.array),
            # so take the byte size from a memoryview instead.
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        This delegates to the writelines() implementation inherited
        from _compression.BaseStream and returns its result.
        """
        return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    filename may be a path (a str, bytes, or PathLike object) or an
    already-open file object to read from or write to.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. It defaults
    to "rb"; compresslevel defaults to 9.

    In binary mode the result is the BZ2File built from
    BZ2File(filename, mode, compresslevel=compresslevel), and passing
    encoding, errors or newline raises ValueError.

    In text mode that BZ2File is wrapped in an io.TextIOWrapper
    configured with the given encoding, error handling behavior, and
    line ending(s). Combining "t" and "b" in mode raises ValueError.
    """
    text_mode = "t" in mode

    if text_mode and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not text_mode:
        # The text-layer options are meaningless without a text wrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the "t" flag.
    stream = BZ2File(filename, mode.replace("t", ""),
                     compresslevel=compresslevel)

    if not text_mode:
        return stream

    encoding = io.text_encoding(encoding)
    return io.TextIOWrapper(stream, encoding, errors, newline)


def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.

    compresslevel, if given, must be a number between 1 and 9.

    Use a BZ2Compressor object directly when the input arrives
    incrementally.
    """
    compressor = BZ2Compressor(compresslevel)
    body = compressor.compress(data)
    # flush() finishes the stream; without it the output is incomplete.
    return body + compressor.flush()


def decompress(data):
    """Decompress data in one shot and return the uncompressed bytes.

    data may hold several compressed streams back to back; their
    contents are concatenated. Trailing bytes that do not form a valid
    stream are ignored once at least one stream has been decoded.

    Raises OSError if the very first stream is invalid, and ValueError
    if a stream ends before its end-of-stream marker.

    Use a BZ2Decompressor object directly when the input arrives
    incrementally.
    """
    chunks = []
    remaining = data
    while remaining:
        stream = BZ2Decompressor()
        try:
            chunk = stream.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Nothing decoded yet: the input itself is bad.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        chunks.append(chunk)
        if not stream.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Anything after the end-of-stream marker may be another stream.
        remaining = stream.unused_data
    return b"".join(chunks)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100336 return b"".join(results)