blob: 948fec293e23d95a45b62e5303ef6fd023e2a707 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020012import _compression
Guido van Rossum15262191997-04-30 16:04:57 +000013
# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks tested against / written to the flag byte of a gzip member
# header (see _GzipReader._read_gzip_header and
# GzipFile._write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2

# Named zlib compression levels.  _COMPRESS_LEVEL_BEST is the default of
# open(), GzipFile() and compress(); main() starts from
# _COMPRESS_LEVEL_TRADEOFF and maps --fast / --best to the other two.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str, bytes or os.PathLike object) or an
    already-open file object that will be read from or written to.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; compresslevel defaults to 9.

    In binary mode the call is equivalent to
    GzipFile(filename, mode, compresslevel) and passing encoding, errors or
    newline raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and newline behaviour.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when filename is neither a path nor an object with read() or write().
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer options only make sense with a TextIOWrapper.
        text_only_options = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for value, message in text_only_options:
            if value is not None:
                raise ValueError(message)

    # GzipFile itself only understands binary modes.
    binary_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        raw = GzipFile(filename, binary_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, binary_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return raw
    return io.TextIOWrapper(raw, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* is any object with a write() method accepting bytes.
    """
    # "<L" = little-endian, standard-size (32-bit) unsigned long.
    packed = struct.pack("<L", value)
    output.write(packed)
73
class _PaddedFile:
    """Bare-bones read-only file wrapper that serves a prepended byte string
    before the contents of the wrapped file.  Intended for use inside
    gzip.py only; it lacks most of the file API."""

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Offset of the next unread byte in _buffer, or None once the buffer
        # is exhausted and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)
        stop = start + size
        if stop <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]
        # The request runs past the buffer: return what is left of it and
        # fetch the remainder from the wrapped file.
        self._read = None
        leftover = self._buffer[start:]
        return leftover + self.file.read(stop - self._length)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call, so the
            # bytes are still in the buffer and only the offset moves back.
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Discard the buffer and seek the wrapped file to *off*."""
        self._buffer = None
        self._read = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
Antoine Pitrou7b969842010-09-23 16:22:51 +0000114
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode contains 't' or 'U', or does not start
        with 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # Remember the file we opened ourselves so close() can close it.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, because the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
            # Recorded so the header's XFL byte can reflect the real level.
            self._compresslevel = compresslevel
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of name (with '.gz' appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # XFL (extra flags): RFC 1952 defines 2 for "maximum compression,
        # slowest algorithm" and 4 for "fastest algorithm".  Advertise the
        # level that is actually used instead of always claiming maximum.
        # The getattr() fallback keeps the historical value for instances
        # that did not go through __init__'s write branch.
        level = getattr(self, '_compresslevel', _COMPRESS_LEVEL_BEST)
        if level == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif level == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS field: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it to the underlying file.

        *data* may be bytes or any object supporting the buffer protocol.
        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) when the file was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Return up to *size* decompressed bytes from the internal buffer.

        Raises OSError (EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffer's peek(n).

        Raises OSError (EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the compressor is flushed and the CRC-32 / size
        trailer is appended; in read mode the internal buffer is closed.
        The underlying file is closed only when this object opened it
        itself (i.e. it was created from a filename).  Calling close() more
        than once is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed first so a failure below cannot cause a second
        # attempt to write the trailer.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to uncompressed position *offset* and return the new position.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  Seeking backwards raises OSError and seeking
        relative to the end raises ValueError.  In read mode the request is
        delegated to the internal buffer.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line (up to *size* bytes) from the internal buffer."""
        self._check_not_closed()
        return self._buffer.readline(size)
380
381
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a (possibly multi-member) gzip stream.

    NOTE(review): self._fp, self._decompressor, self._decomp_factory,
    self._decomp_args, self._pos, self._size and readall() are not defined
    in this class; they are presumably provided by
    _compression.DecompressReader -- confirm against that module.
    """

    def __init__(self, fp):
        # The file is wrapped in _PaddedFile so that bytes read past the end
        # of a member can be pushed back with prepend(); negative wbits asks
        # zlib for a raw deflate stream since the gzip framing is parsed here.
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size counters for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.

        Raises EOFError if the file ends before *n* bytes were read.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse and skip one gzip member header.

        Returns False if the file is already at EOF (no further member) and
        True after a header has been consumed.  Raises OSError for a wrong
        magic number or an unknown compression method.  The header's mtime
        field is stored in self._last_mtime.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file (%r)' % magic)

        # One byte method, one byte flags, 4-byte little-endian mtime; the
        # trailing "xx" skips the two remaining fixed header bytes.
        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* bytes of decompressed data.

        A negative size delegates to readall(); size 0 returns b"".  b"" is
        also returned when no further gzip member follows.  Raises EOFError
        if the compressed data ends in the middle of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No more members: record the total decompressed size.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                # Input left unprocessed by this call is pushed back so the
                # next read picks it up again.
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises OSError if the stored CRC or size does not match the values
        computed while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        """Rewind via the base class and expect a member header next."""
        super()._rewind()
        self._new_member = True
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000522
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress *data* in one shot and return the gzip-compressed bytes.

    compresslevel is the compression level, in the range 0-9.  mtime is
    forwarded to GzipFile as the timestamp for the gzip header.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb',
                      compresslevel=compresslevel, mtime=mtime)
    # Leaving the "with" block closes the GzipFile, which finishes the stream.
    with writer:
        writer.write(data)
    return sink.getvalue()
531
def decompress(data):
    """Decompress gzip-compressed *data* in one shot.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
538
539
def main():
    """Command-line entry point: behave like gzip/gunzip but keep the input.

    Each positional argument is a file to process; "-" (the default) means
    read standard input and write standard output.  Without -d a file NAME
    is compressed to NAME.gz; with -d a file NAME.gz is decompressed to NAME
    and names not ending in ".gz" are reported and skipped.  --fast and
    --best select the compression level (the default is the trade-off
    level) for both the stdin and the named-file case.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                       help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # Every file opened here is registered with the ExitStack right
        # away, so it is closed even if opening the second file or the copy
        # loop raises.  Callbacks run in reverse order: output first, then
        # input.  The process-wide stdin/stdout buffers are never closed.
        with ExitStack() as cleanup:
            if args.decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer)
                    cleanup.callback(f.close)
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = open(arg, "rb")
                    cleanup.callback(f.close)
                    g = builtins.open(arg[:-3], "wb")
                    cleanup.callback(g.close)
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel)
                    cleanup.callback(g.close)
                else:
                    f = builtins.open(arg, "rb")
                    cleanup.callback(f.close)
                    # Pass the selected level so --fast/--best also apply
                    # to named files, not only to the stdin/stdout case.
                    g = open(arg + ".gz", "wb", compresslevel=compresslevel)
                    cleanup.callback(g.close)
            while True:
                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000588
# Run the command-line interface when this file is executed as a script.
if __name__ == '__main__':
    main()