blob: 87b553df66cd8504b27e99f5cc9e8e7970745ae7 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020012import _compression
Guido van Rossum15262191997-04-30 16:04:57 +000013
# Names exported by a star-import of this module.
__all__ = [
    "BadGzipFile",
    "GzipFile",
    "open",
    "compress",
    "decompress",
]
Skip Montanaro2dd42762001-01-23 15:35:05 +000015
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1 << 0
FHCRC = 1 << 1
FEXTRA = 1 << 2
FNAME = 1 << 3
FCOMMENT = 1 << 4

# Values stored in GzipFile.mode to record how the stream was opened.
READ = 1
WRITE = 2

# Named compression levels used as defaults in this module.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike) or an already
    open file object, recognised by having a read() or write() attribute.

    *mode* is passed to GzipFile after removing any "t".  A mode containing
    "t" selects text mode: the GzipFile is wrapped in an io.TextIOWrapper
    built with *encoding*, *errors* and *newline*.  Without "t" the GzipFile
    itself is returned, and those three arguments must be left as None.

    Raises ValueError if the mode contains both "t" and "b", or if a
    text-only argument is given in binary mode, and TypeError if *filename*
    is neither a path nor a file object.
    """
    wants_text = "t" in mode
    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not wants_text:
        # The text-layer options are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile only understands binary modes, so drop the "t".
    binary_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        raw = GzipFile(filename, binary_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, binary_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not wants_text:
        return raw
    return io.TextIOWrapper(raw, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    The bytes are produced with the struct format "<L" and handed to
    output.write() in a single call.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
73
Antoine Pitrou7b969842010-09-23 16:22:51 +000074class _PaddedFile:
75 """Minimal read-only file object that prepends a string to the contents
76 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
77 essential functionality."""
78
79 def __init__(self, f, prepend=b''):
80 self._buffer = prepend
81 self._length = len(prepend)
82 self.file = f
83 self._read = 0
84
85 def read(self, size):
86 if self._read is None:
87 return self.file.read(size)
88 if self._read + size <= self._length:
89 read = self._read
90 self._read += size
91 return self._buffer[read:self._read]
92 else:
93 read = self._read
94 self._read = None
95 return self._buffer[read:] + \
96 self.file.read(size-self._length+read)
97
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020098 def prepend(self, prepend=b''):
Antoine Pitrou7b969842010-09-23 16:22:51 +000099 if self._read is None:
100 self._buffer = prepend
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200101 else: # Assume data was read since the last prepend() call
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102 self._read -= len(prepend)
103 return
Antoine Pitrou7b969842010-09-23 16:22:51 +0000104 self._length = len(self._buffer)
105 self._read = 0
106
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200107 def seek(self, off):
Antoine Pitrou7b969842010-09-23 16:22:51 +0000108 self._read = None
109 self._buffer = None
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200110 return self.file.seek(off)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000111
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200112 def seekable(self):
113 return True # Allows fast-forwarding even in unseekable streams
Antoine Pitrou7b969842010-09-23 16:22:51 +0000114
Zackery Spytzcf599f62019-05-13 01:50:52 -0600115
class BadGzipFile(OSError):
    """Raised when data being read is not a valid gzip stream.

    Being an OSError subclass, it is also caught by handlers written for
    OSError.
    """
118
119
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError for a mode containing 't' or 'U' or not starting
        with one of 'r', 'w', 'a', 'x'.  If setup fails after this
        constructor opened the file itself, that file is closed again before
        the exception propagates; a caller-supplied fileobj is never closed.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    # e.g. an integer file descriptor: unusable as a name
                    filename = ''
            else:
                filename = os.fspath(filename)
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: raw deflate data, the gzip header and
                # trailer are written by this class itself.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Setup failed.  Mark this object as closed and release the file
            # this constructor opened, so the handle does not leak.
            self.fileobj = None
            owned = self.myfileobj
            if owned is not None:
                self.myfileobj = None
                owned.close()
            raise

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode a ".gz" suffix is appended when the name lacks one.
        Works for both str and bytes names.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        name = self.name
        # The name may be bytes; compare and concatenate with a matching type.
        suffix = b".gz" if isinstance(name, bytes) else ".gz"
        if self.mode == WRITE and name[-3:] != suffix:
            return name + suffix
        return name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        if self.mode != READ:
            # Nothing is read from the stream when writing, and there is no
            # read buffer to query.
            return None
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte: records whether the best or the fastest level
        # was requested.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the number of
        uncompressed bytes consumed.

        Raises OSError (EBADF) on a read-only object and ValueError on a
        closed one.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read from the decompressed stream via the internal buffer.

        Raises OSError (EBADF) on a write-only object.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffer's peek().

        Raises OSError (EBADF) on a write-only object.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call twice.

        In write mode the compressor is flushed and the CRC and size are
        appended.  In read mode the internal buffer is closed.  The
        underlying file is closed only if this object opened it.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with *zlib_mode* and then
        flush the underlying file object.  Does nothing in read mode."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  Seeking relative to the end raises ValueError
        and seeking backwards raises OSError.  In read mode the request is
        delegated to the internal buffer.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line from the decompressed stream.

        Raises OSError (EBADF) on a write-only object, like read().
        """
        self._check_not_closed()
        if self.mode != READ:
            # A write-mode object has no read buffer; fail the same way the
            # other read methods do.
            import errno
            raise OSError(errno.EBADF,
                          "readline() on write-only GzipFile object")
        return self._buffer.readline(size)
391
392
393class _GzipReader(_compression.DecompressReader):
394 def __init__(self, fp):
395 super().__init__(_PaddedFile(fp), zlib.decompressobj,
396 wbits=-zlib.MAX_WBITS)
397 # Set flag indicating start of a new member
398 self._new_member = True
399 self._last_mtime = None
400
401 def _init_read(self):
Martin Panterb82032f2015-12-11 05:19:29 +0000402 self._crc = zlib.crc32(b"")
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200403 self._stream_size = 0 # Decompressed size of unconcatenated stream
404
405 def _read_exact(self, n):
406 '''Read exactly *n* bytes from `self._fp`
407
408 This method is required because self._fp may be unbuffered,
409 i.e. return short reads.
410 '''
411
412 data = self._fp.read(n)
413 while len(data) < n:
414 b = self._fp.read(n - len(data))
415 if not b:
416 raise EOFError("Compressed file ended before the "
417 "end-of-stream marker was reached")
418 data += b
419 return data
420
421 def _read_gzip_header(self):
422 magic = self._fp.read(2)
423 if magic == b'':
424 return False
425
426 if magic != b'\037\213':
Zackery Spytzcf599f62019-05-13 01:50:52 -0600427 raise BadGzipFile('Not a gzipped file (%r)' % magic)
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200428
429 (method, flag,
430 self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
431 if method != 8:
Zackery Spytzcf599f62019-05-13 01:50:52 -0600432 raise BadGzipFile('Unknown compression method')
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200433
434 if flag & FEXTRA:
435 # Read & discard the extra field, if present
436 extra_len, = struct.unpack("<H", self._read_exact(2))
437 self._read_exact(extra_len)
438 if flag & FNAME:
439 # Read and discard a null-terminated string containing the filename
440 while True:
441 s = self._fp.read(1)
442 if not s or s==b'\000':
443 break
444 if flag & FCOMMENT:
445 # Read and discard a null-terminated string containing a comment
446 while True:
447 s = self._fp.read(1)
448 if not s or s==b'\000':
449 break
450 if flag & FHCRC:
451 self._read_exact(2) # Read & discard the 16-bit header CRC
452 return True
453
454 def read(self, size=-1):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000455 if size < 0:
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200456 return self.readall()
457 # size=0 is special because decompress(max_length=0) is not supported
458 if not size:
459 return b""
Antoine Pitroub1f88352010-01-03 22:37:40 +0000460
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200461 # For certain input data, a single
462 # call to decompress() may not return
463 # any data. In this case, retry until we get some data or reach EOF.
464 while True:
465 if self._decompressor.eof:
466 # Ending case: we've come to the end of a member in the file,
467 # so finish up this member, and read a new gzip header.
468 # Check the CRC and file size, and set the flag so we read
469 # a new member
470 self._read_eof()
471 self._new_member = True
472 self._decompressor = self._decomp_factory(
473 **self._decomp_args)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000474
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200475 if self._new_member:
476 # If the _new_member flag is set, we have to
477 # jump to the next member, if there is one.
478 self._init_read()
479 if not self._read_gzip_header():
480 self._size = self._pos
481 return b""
482 self._new_member = False
Guido van Rossum15262191997-04-30 16:04:57 +0000483
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200484 # Read a chunk of data from the file
485 buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
486
487 uncompress = self._decompressor.decompress(buf, size)
488 if self._decompressor.unconsumed_tail != b"":
489 self._fp.prepend(self._decompressor.unconsumed_tail)
490 elif self._decompressor.unused_data != b"":
491 # Prepend the already read bytes to the fileobj so they can
492 # be seen by _read_eof() and _read_gzip_header()
493 self._fp.prepend(self._decompressor.unused_data)
494
495 if uncompress != b"":
Thomas Wouters477c8d52006-05-27 19:21:47 +0000496 break
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200497 if buf == b"":
498 raise EOFError("Compressed file ended before the "
499 "end-of-stream marker was reached")
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000500
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200501 self._add_read_data( uncompress )
502 self._pos += len(uncompress)
503 return uncompress
Tim Peters07e99cb2001-01-14 23:47:14 +0000504
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200505 def _add_read_data(self, data):
Martin Panterb82032f2015-12-11 05:19:29 +0000506 self._crc = zlib.crc32(data, self._crc)
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200507 self._stream_size = self._stream_size + len(data)
508
509 def _read_eof(self):
510 # We've read to the end of the file
511 # We check the that the computed CRC and size of the
512 # uncompressed data matches the stored values. Note that the size
513 # stored is the true file size mod 2**32.
514 crc32, isize = struct.unpack("<II", self._read_exact(8))
515 if crc32 != self._crc:
Zackery Spytzcf599f62019-05-13 01:50:52 -0600516 raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
517 hex(self._crc)))
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200518 elif isize != (self._stream_size & 0xffffffff):
Zackery Spytzcf599f62019-05-13 01:50:52 -0600519 raise BadGzipFile("Incorrect length of data produced")
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +0200520
521 # Gzip files can be padded with zeroes and still have archives.
522 # Consume all zero bytes and set the file position to the first
523 # non-zero byte. See http://www.gzip.org/#faq8
524 c = b"\x00"
525 while c == b"\x00":
526 c = self._fp.read(1)
527 if c:
528 self._fp.prepend(c)
529
530 def _rewind(self):
531 super()._rewind()
532 self._new_member = True
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000533
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    *compresslevel* and *mtime* are passed through to GzipFile, which
    writes into an in-memory buffer.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel,
                      mtime=mtime)
    with writer:
        writer.write(data)
    return sink.getvalue()
542
def decompress(data):
    """Decompress gzip-compressed bytes in one shot.

    *data* is wrapped in an in-memory file, read to the end through
    GzipFile, and the decompressed bytes are returned.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        return reader.read()
549
550
def main():
    """Command line interface: gzip or gunzip each named file.

    "-" (the default) means compress from stdin to stdout, or the reverse
    with -d.  Input files are never deleted.  --fast and --best select the
    compression level for both stdin and named files; without either flag
    the trade-off level is used.

    Every file or GzipFile opened for an argument is closed again, even if
    opening its counterpart or copying the data raises.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                       help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # The stack closes what was opened for this argument in reverse
        # order (output first, then input).  The standard streams are never
        # entered, so they stay open.
        with ExitStack() as stack:
            if args.decompress:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    # Pass the selected level so --fast/--best also apply
                    # to named files, not only to stdin.
                    g = stack.enter_context(
                        open(arg + ".gz", "wb", compresslevel=compresslevel))
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)


if __name__ == '__main__':
    main()