"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import errno
import io
import os
import struct
import sys
import time
import zlib
import _compression

# Names exported by ``from gzip import *``.
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks for the flags byte of a gzip member header.  FNAME is set by
# GzipFile when a file name is stored; FEXTRA, FNAME, FCOMMENT and FHCRC are
# tested by _GzipReader to skip the corresponding optional header fields.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2

# Named zlib compression levels: defaults for open()/GzipFile()/compress()
# and the levels selected by the command line interface.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes or
    os.PathLike object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError if the mode contains both "t" and "b", or if encoding,
    errors or newline is given in binary mode.  Raises TypeError if filename
    is neither a path nor an object with a read() or write() method.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes; the text layer (if any)
    # is added on top afterwards.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file

def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer.

    *output* is any object with a write() method accepting bytes.  *value*
    must fit in 32 unsigned bits; struct.pack() raises struct.error
    otherwise, so callers mask larger values first.
    """
    output.write(struct.pack("<L", value))

class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object *f*, serving the bytes in *prepend* first."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Read position inside self._buffer, or None once the buffer has
        # been exhausted and reads go straight to the underlying file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Hand out the rest of the buffer and top it up from the file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that the next read() returns it first."""
        if self._read is None:
            self._buffer = prepend
        else:  # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Discard any prepended data and seek the underlying file."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams


class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files.

    Within this module it is raised while reading when the magic number or
    compression method in a member header is wrong, or when the CRC or
    length stored in a member trailer does not match the decompressed data.
    """


class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError for a text ('t'), universal-newline ('U') or
        otherwise unrecognised mode.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We opened the file ourselves, so close() must close it too.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Remember whether the caller chose the mode explicitly; writing
        # with a mode inferred from fileobj triggers a FutureWarning below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases. "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: produce a raw deflate stream; the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of ``name`` (with '.gz' appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None.

        The value comes from the reader behind ``self._buffer``, which only
        exists for objects opened for reading; on a write-mode object this
        attribute access raises AttributeError.
        """
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self):
        """Write the gzip member header (RFC 1952) to the underlying file."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))          # flags byte
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags (XFL)
        self.fileobj.write(b'\377')                 # OS field: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the number of bytes of
        uncompressed input consumed.

        Raises ValueError if the file is closed and OSError (EBADF) if it
        was opened for reading.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* decompressed bytes (all if negative).

        Raises OSError (EBADF) if the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Return buffered decompressed bytes without advancing the position.

        Raises OSError (EBADF) if the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file.

        In write mode the compressor is flushed and the CRC/size trailer is
        written.  A file object opened by this instance is closed as well; a
        file object supplied by the caller is left open.  Calling close()
        more than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes, and the new offset is returned.  In read mode
        the request is passed to the buffered reader and its result is
        returned.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line of decompressed data.

        Raises OSError (EBADF) if the file was opened for writing, like the
        other read methods.
        """
        self._check_not_closed()
        if self.mode != READ:
            raise OSError(errno.EBADF,
                          "readline() on write-only GzipFile object")
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a gzip stream member by member.

    GzipFile wraps an instance of this class in an io.BufferedReader when a
    file is opened for reading.
    """

    def __init__(self, fp):
        # Negative wbits: zlib sees a raw deflate stream; gzip headers and
        # trailers are parsed by this class.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.

        Raises EOFError if the file ends before *n* bytes were read.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse one member header.

        Returns False if the file is at EOF (no further member) and True
        once a header has been consumed.  Raises BadGzipFile for a wrong
        magic number or an unknown compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        # Compression method, flags and mtime; the two trailing header bytes
        # are skipped ("xx").
        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of file.

        A negative *size* reads everything via readall().  Raises EOFError
        if the compressed data ends in the middle of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: record the current position as the
                    # total size and report EOF.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data(uncompress)
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises BadGzipFile if the stored CRC or length does not match.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        # After rewinding, the next read() must parse a member header again.
        self._new_member = True

def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    compresslevel is the compression level, in range of 0-9 (default 9).
    mtime is passed on to GzipFile as the timestamp stored in the gzip
    header; if omitted or None, the current time is used.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel,
                  mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()

def decompress(data):
    """Decompress gzip-compressed bytes in one shot.

    Return the decompressed bytes.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def main():
    """Command line interface: compress or decompress the named files.

    Each argument is a file name, or "-" for stdin/stdout (the default).
    Compressing FILE writes FILE.gz; decompressing FILE.gz writes FILE.
    Input files are never deleted.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                       help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # Everything opened for this argument is registered on the stack so
        # that it is closed even if opening the second file or the copy loop
        # fails.  The stack unwinds in reverse order: output first, then
        # input.  sys.stdin.buffer and sys.stdout.buffer are never registered
        # and therefore stay open.
        with ExitStack() as stack:
            if args.decompress:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    g = stack.enter_context(open(arg + ".gz", "wb"))
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)


if __name__ == '__main__':
    main()