blob: e422773b3edfb7082062b0b97f5a0833a04ade5e [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020012import _compression
Guido van Rossum15262191997-04-30 16:04:57 +000013
# Public names exported by a star-import of this module.
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000015
# Bit masks tested against / written into the flag byte of a gzip member
# header (see _GzipReader._read_gzip_header and
# GzipFile._write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the stream was opened.
READ = 1
WRITE = 2

# Named zlib compression levels used as defaults and by the command-line
# interface.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (a str, bytes or os.PathLike object) or an
    already-open file object (anything with a read or write attribute) to
    read from or write to.

    mode may be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; compresslevel defaults to 9.

    In binary mode the returned object is a GzipFile built from filename,
    mode and compresslevel, and supplying encoding, errors or newline
    raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper created
    with the given encoding, errors and newline.

    Raises ValueError if mode contains both "t" and "b", and TypeError if
    filename is neither a path nor a file-like object.

    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # Text-only arguments make no sense for a binary stream; reject the
        # first one that was supplied, in signature order.
        text_only_arguments = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for supplied, complaint in text_only_arguments:
            if supplied is not None:
                raise ValueError(complaint)

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        raw = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return raw
    return io.TextIOWrapper(raw, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method accepting bytes.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
73
class _PaddedFile:
    """Minimal read-only file object that serves a prepended bytes buffer
    before the contents of an actual file.  Shouldn't be used outside of
    gzip.py, as it lacks essential functionality."""

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Offset of the next unread byte in self._buffer, or None once the
        # buffer is exhausted and reads go straight to the file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)

        end = start + size
        if end <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = end
            return self._buffer[start:end]

        # Hand out the rest of the buffer and top up from the real file.
        self._read = None
        return self._buffer[start:] + self.file.read(end - self._length)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call: simply
            # step the buffer offset back over it.
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Discard any buffered data and seek the underlying file."""
        self._buffer = None
        self._read = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
Antoine Pitrou7b969842010-09-23 16:22:51 +0000114
Zackery Spytzcf599f62019-05-13 01:50:52 -0600115
class BadGzipFile(OSError):
    """Raised when the data being read is not a valid gzip stream.

    _GzipReader raises it for a wrong magic number, an unknown compression
    method, a CRC mismatch or a length mismatch.
    """
118
119
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode requests text or universal-newline
        access, or does not start with 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We own this file object, so close() must close it too.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Remember whether the caller specified a mode, for the warning below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases. "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits requests a raw deflate stream; the gzip header
            # and trailer are written by this class instead.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode a ".gz" suffix is appended when the name does not
        already end with one.  Emits a DeprecationWarning on every access.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        # self.name may be bytes (os.fspath() or fileobj.name can yield
        # bytes), so the suffix must be of the matching type; comparing or
        # concatenating a str suffix with a bytes name would fail.
        suffix = b".gz" if isinstance(self.name, bytes) else ".gz"
        if self.mode == WRITE and self.name[-3:] != suffix:
            return self.name + suffix
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        # NOTE: self._buffer is only created in read mode (see __init__).
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        # Strip the enclosing brackets of the wrapped object's repr.
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_read_mode(self, operation):
        """Raise OSError(EBADF) unless the file was opened for reading.

        *operation* is the caller's name including parentheses, e.g.
        "read()", and is only used to build the error message.
        """
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF,
                          "%s on write-only GzipFile object" % operation)

    def _init_write(self, filename):
        """Reset the state tracked while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        # writebuf and bufsize are not referenced elsewhere in this class;
        # they are kept as attributes in case external code relies on them.
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj.

        The header consists of the magic number, compression method, flag
        byte, modification time, an extra-flags byte derived from
        *compresslevel*, an OS byte and, when available, the
        NUL-terminated original file name.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte: RFC 1952 uses 2 for maximum compression and 4
        # for the fastest algorithm.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte (255: unknown)
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it to the underlying file object.

        *data* may be bytes or any object accepted by memoryview().
        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) if the file was not opened for writing and ValueError if
        the underlying file object is gone.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes.

        Delegates to the internal buffered reader.  Raises OSError (EBADF)
        if the file was not opened for reading.
        """
        self._check_not_closed()
        self._check_read_mode("read()")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        self._check_read_mode("read1()")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Return buffered uncompressed data without consuming it.

        Delegates to the internal buffered reader's peek().  Raises OSError
        (EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        self._check_read_mode("peek()")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file object opened by the constructor
        is closed; one supplied by the caller is left open.  Calling
        close() more than once is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed up front so a failure below cannot lead to the
        # trailer being written twice.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file object.

        *zlib_mode* is passed to the compressor's flush().  Does nothing
        beyond the closed check when the file is open for reading.
        """
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always return True; see seek() for the supported operations."""
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks relative to the start or the
        current position are possible; the gap is filled by writing zero
        bytes.  In read mode the call is delegated to the internal buffered
        reader.  Returns the new position.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            # Pad with zeros in 1 KiB pieces, then the remainder.
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line of uncompressed data.

        Raises OSError (EBADF) if the file was not opened for reading, in
        line with read(), read1() and peek().
        """
        self._check_not_closed()
        # Only read mode creates self._buffer; fail with a clear error
        # instead of an AttributeError when opened for writing.
        self._check_read_mode("readline()")
        return self._buffer.readline(size)
399
400
class _GzipReader(_compression.DecompressReader):
    # Raw reader that decodes a sequence of gzip members read from a file
    # object.  GzipFile wraps it in an io.BufferedReader.
    #
    # NOTE(review): self._fp, self._decompressor, self._decomp_factory,
    # self._decomp_args, self._pos and self._size are not assigned in this
    # class before use; presumably they are provided by
    # _compression.DecompressReader -- confirm against that module.

    def __init__(self, fp):
        # Wrap fp in a _PaddedFile so that bytes read past the end of a
        # member can be pushed back with prepend().
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        # mtime field of the most recently parsed member header, if any
        self._last_mtime = None

    def _init_read(self):
        # Reset the running CRC and length used to verify the member that
        # is about to be decompressed.
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.

        Raises EOFError if the file ends before *n* bytes were read.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        # Parse and skip one gzip member header.
        #
        # Returns False if the file is already at EOF (no further member),
        # True once a header has been consumed.  Raises BadGzipFile for a
        # wrong magic number or an unknown compression method.
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        # One byte method, one byte flags, four bytes little-endian mtime;
        # the two trailing pad bytes of the format skip the rest of the
        # fixed-size header.
        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        # Return up to *size* decompressed bytes, or everything that is left
        # when size is negative.  Returns b"" when no further member
        # follows.  Raises EOFError if the file ends in the middle of a
        # member.
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: the current position is recorded
                    # as the total size.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                # Input the decompressor has not processed yet goes back to
                # the file so that the next iteration reads it again.
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        # Fold freshly decompressed data into the running CRC and length of
        # the current member.
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        # After the base class has rewound the stream, the next read() must
        # parse a member header again.
        super()._rewind()
        self._new_member = True
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000541
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    compresslevel is the optional compression level, in range of 0-9.
    mtime is the optional timestamp handed to GzipFile for the header's
    modification time field.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel,
                      mtime=mtime)
    # Leaving the with block closes the GzipFile, which writes the trailer.
    with writer:
        writer.write(data)
    return sink.getvalue()
550
def decompress(data):
    """Decompress gzip-compressed bytes in one shot.

    Return the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
557
558
def main():
    """Command-line interface: compress or decompress the named files.

    Each argument is processed in turn; "-" (the default) stands for
    standard input and standard output.  When compressing, FILE is written
    to FILE.gz; when decompressing (-d), FILE.gz is written to FILE and
    arguments not ending in ".gz" are reported and skipped.  Input files
    are never deleted.  --fast and --best select the compression level.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress and arg != "-" and arg[-3:] != ".gz":
            print("filename doesn't end in .gz:", repr(arg))
            continue

        # Every stream opened here is registered with the ExitStack so it
        # is closed even if opening the second file or the copy loop
        # fails.  The standard streams are never registered and therefore
        # stay open.  Streams are closed in reverse order of registration:
        # output first, then input.
        with ExitStack() as stack:
            if args.decompress:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    g = stack.enter_context(open(arg + ".gz", "wb"))

            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000607
# Run the command-line interface when this module is executed as a script.
if __name__ == '__main__':
    main()