# Recovered from a repository blame view
# (blob 7f1d20632ef139887bfb497f0261dea83d068978).
"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

# The built-in open() is aliased because this module defines its own open().
from builtins import open as _builtin_open
import io
import os
import _compression

from _bz2 import BZ2Compressor, BZ2Decompressor


# Values stored in BZ2File._mode to record the state of the file object.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError if compresslevel is out of range or mode is not
        recognised, and TypeError if filename is neither a path nor an
        object with a read() or write() method.
        """
        # Establish the "closed" state first: close() returns immediately
        # while _mode is _MODE_CLOSED, so it remains safe to call even if
        # one of the checks below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise mode to the binary form handed to the built-in open().
        # An empty string is accepted as a synonym for reading.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "x", "xb", "a", "ab"):
            # All write-style modes differ only in how the file is opened.
            mode = mode[0] + "b"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it too.
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it is used but never closed here.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            # The nested try/finally resets the object's state even when
            # the flush above or the close below raises.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED
                self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        # Only read mode has a buffer to delegate to; write mode yields False.
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek()
        # always returns at least one byte (except at EOF), independent
        # of the value of n
        return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        self._check_can_read()
        return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        # Accept any integer-like object, converting it via __index__().
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readline(size)

    def __iter__(self):
        """Return an iterator over the lines of the uncompressed data.

        Iteration is delegated to the internal read buffer.
        """
        self._check_can_read()
        return self._buffer.__iter__()

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        # Accept any integer-like object, converting it via __index__().
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        self._check_can_read()
        return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        self._pos += length
        return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        # In write mode the position is the uncompressed byte count.
        return self._pos


Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200275def open(filename, mode="rb", compresslevel=9,
276 encoding=None, errors=None, newline=None):
277 """Open a bzip2-compressed file in binary or text mode.
278
Berker Peksag8bdd4482016-10-02 20:07:06 +0300279 The filename argument can be an actual filename (a str, bytes, or
280 PathLike object), or an existing file object to read from or write
281 to.
Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200282
Nadeem Vawda8a9e99c2013-10-19 00:11:06 +0200283 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
284 "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
285 The default mode is "rb", and the default compresslevel is 9.
Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200286
Nadeem Vawda4907b0a2012-10-08 20:31:34 +0200287 For binary mode, this function is equivalent to the BZ2File
288 constructor: BZ2File(filename, mode, compresslevel). In this case,
289 the encoding, errors and newline arguments must not be provided.
Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200290
291 For text mode, a BZ2File object is created, and wrapped in an
Nadeem Vawda4907b0a2012-10-08 20:31:34 +0200292 io.TextIOWrapper instance with the specified encoding, error
293 handling behavior, and line ending(s).
Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200294
295 """
296 if "t" in mode:
297 if "b" in mode:
298 raise ValueError("Invalid mode: %r" % (mode,))
299 else:
300 if encoding is not None:
301 raise ValueError("Argument 'encoding' not supported in binary mode")
302 if errors is not None:
303 raise ValueError("Argument 'errors' not supported in binary mode")
304 if newline is not None:
305 raise ValueError("Argument 'newline' not supported in binary mode")
306
307 bz_mode = mode.replace("t", "")
308 binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)
309
310 if "t" in mode:
Inada Naoki48274832021-03-29 12:28:14 +0900311 encoding = io.text_encoding(encoding)
Nadeem Vawdaaf518c12012-06-04 23:32:38 +0200312 return io.TextIOWrapper(binary_file, encoding, errors, newline)
313 else:
314 return binary_file
315
316
def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    Returns the complete compressed stream as a single bytes object:
    the compressor's output for data followed by its flushed remainder.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()


def decompress(data):
    """Decompress a block of data.

    data may hold several compressed streams back to back; their
    decompressed contents are concatenated. Once at least one stream has
    been decoded, leftover bytes that do not form a valid bzip2 stream
    are silently ignored.

    Raises OSError if the first stream is invalid, and ValueError if a
    stream ends before its end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        # A decompressor handles exactly one stream, so use a fresh one
        # for each stream found in the input.
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        # Whatever follows the finished stream is fed to the next round.
        data = decomp.unused_data
    return b"".join(results)
Nadeem Vawda1de19ac2013-12-04 23:01:15 +0100348 return b"".join(results)