blob: 2968f475efad313511c2a5a8fcdd943fc18b328c [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020012import _compression
Guido van Rossum15262191997-04-30 16:04:57 +000013
Zackery Spytzcf599f62019-05-13 01:50:52 -060014__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000015
# Bit masks tested against / written to the flag byte of a gzip header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode.
READ = 1
WRITE = 2

# Named zlib compression levels used as defaults by this module.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str, bytes or os.PathLike object) or an
    already-open file object, i.e. anything with a read or a write
    attribute.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    mode, or "rt", "wt", "xt" or "at" for text mode.  It defaults to "rb";
    compresslevel defaults to 9.

    In binary mode the result is GzipFile(filename, mode, compresslevel),
    and passing encoding, errors or newline raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper that is
    given the encoding, errors and newline arguments.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when filename is neither a path nor a file-like object.
    """
    wants_text = "t" in mode
    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not wants_text:
        # The text-layer options are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself rejects "t", so strip it before delegating.
    binary_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        gz = GzipFile(filename, binary_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        gz = GzipFile(None, binary_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not wants_text:
        return gz
    return io.TextIOWrapper(gz, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    # "<L" is the struct format for a little-endian unsigned 32-bit value.
    packed = struct.pack("<L", value)
    output.write(packed)
73
class _PaddedFile:
    """Minimal read-only file wrapper that serves a prepended byte string
    before the contents of the real file.  It lacks essential file
    functionality and is not meant to be used outside of gzip.py."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Offset of the next unread byte in self._buffer, or None once the
        # buffer is exhausted and reads go straight to the file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)
        stop = start + size
        if stop <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]
        # Hand out the rest of the buffer, topped up from the file.
        self._read = None
        return self._buffer[start:] + self.file.read(stop - self._length)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call, so it is
            # enough to move the buffer offset backwards.
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Drop any buffered bytes and seek the underlying file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
Antoine Pitrou7b969842010-09-23 16:22:51 +0000114
Zackery Spytzcf599f62019-05-13 01:50:52 -0600115
class BadGzipFile(OSError):
    """Raised for some kinds of invalid gzip data.

    Being an OSError subclass, it is also caught by ``except OSError``.
    """
118
119
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode contains 't' or 'U', or does not start
        with one of 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We own this file object, so close() must close it too.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate data, the gzip header and trailer
            # are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode ".gz" is appended when the name does not already end
        with it.  Accessing this property emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        # Strip the first and last character of the inner repr (normally
        # the surrounding angle brackets) and embed the rest.
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel=_COMPRESS_LEVEL_BEST):
        """Write the gzip member header to self.fileobj.

        compresslevel selects the XFL (extra flags) byte: 2 for the best
        level, 4 for the fastest level and 0 for anything else.  It
        defaults to the best level so that callers passing no argument
        still get the value 2.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # XFL: describe the compression level that is actually in use
        # instead of always claiming maximum compression.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it to the underlying file.

        data may be bytes or any object supporting the buffer protocol.
        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) when the file is not open for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read through the internal buffered reader and return the result.

        Raises OSError (EBADF) when the file is not open for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek().

        Raises OSError (EBADF) when the file is not open for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file; a no-op if already closed.

        In write mode the compressor is flushed and the CRC-32 and size
        trailer is written.  The underlying file is closed only when it
        was opened by this object (i.e. only a filename was given).
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed first so that a failure below cannot lead to the
        # trailer being written twice by a second close() call.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush pending compressed data to the file.

        zlib_mode is passed to the compressor's flush().  In read mode
        this only checks that the file is not closed.
        """
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks relative to the start or the
        current position are supported; the gap is filled by writing zero
        bytes, and the new offset is returned.  In read mode the call is
        delegated to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line through the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)
385
386
class _GzipReader(_compression.DecompressReader):
    """Raw reader that parses gzip member headers and trailers around the
    deflate data handled by the DecompressReader base class."""

    def __init__(self, fp):
        # Negative wbits: the decompressor sees raw deflate data only; the
        # gzip header and trailer are handled by this class.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''
        collected = self._fp.read(n)
        while len(collected) < n:
            piece = self._fp.read(n - len(collected))
            if not piece:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            collected += piece
        return collected

    def _skip_zero_terminated(self):
        """Read and discard bytes up to and including a zero byte or EOF."""
        while True:
            byte = self._fp.read(1)
            if not byte or byte == b'\000':
                return

    def _read_gzip_header(self):
        """Consume one member header.

        Returns False when the file is exhausted before any header byte is
        read, True once a header has been consumed.  Raises BadGzipFile on
        a wrong magic number or an unknown compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        fixed_part = self._read_exact(8)
        method, flag, self._last_mtime = struct.unpack("<BBIxx", fixed_part)
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            self._skip_zero_terminated()
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            self._skip_zero_terminated()
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes; b"" signals end of file.

        A negative size reads everything via readall().
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            raw = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            plain = self._decompressor.decompress(raw, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if plain != b"":
                self._add_read_data(plain)
                self._pos += len(plain)
                return plain
            if raw == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size += len(data)

    def _read_eof(self):
        """Validate the member trailer and skip any zero padding after it."""
        # We've read to the end of the file.
        # Check that the computed CRC and size of the uncompressed data
        # match the stored values.  Note that the size stored is the true
        # file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        if isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        while True:
            byte = self._fp.read(1)
            if byte != b"\x00":
                break
        if byte:
            self._fp.prepend(byte)

    def _rewind(self):
        super()._rewind()
        # After rewinding, the next read must start with a member header.
        self._new_member = True
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000527
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    compresslevel and mtime are passed through to the GzipFile that
    performs the compression.
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel,
                  mtime=mtime)
    with gz:
        gz.write(data)
    # The stream is complete only after the with block has closed gz.
    return sink.getvalue()
536
def decompress(data):
    """Decompress gzip-compressed data in one shot.

    Return the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
543
544
def main():
    """Command line interface: compress or decompress the named files.

    Each argument is processed in turn; "-" stands for standard input and
    standard output.  Input files are never deleted.  --fast and --best
    select the compression level, which otherwise is the trade-off level;
    -d/--decompress switches to decompression.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Honour --fast/--best for named files too, not only when
                # writing to standard output.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy fails, but never close the
            # process-wide standard streams.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000593
# Run the command line interface when this file is executed as a script.
if __name__ == "__main__":
    main()