blob: deaf15db4f9953c0c42fb4c32db8c64943484a03 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header
# (see _read_gzip_header / _write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError if the mode mixes "t" and "b", or if a text-only
    argument is given in binary mode, and TypeError if filename is neither
    a str/bytes object nor an object with a read() or write() method.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" not in mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except Exception:
        # The wrapper could not be built (e.g. unknown encoding): close the
        # GzipFile created above instead of leaving it open until it is
        # garbage collected.
        binary_file.close()
        raise
62
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output only needs a write() method; the packing is done with the
    struct "<L" format.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
67
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Position inside self._buffer, or None once the buffer has been
        # used up and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return size bytes, taken from the prepended buffer first."""
        pos = self._read
        if pos is None:
            return self.file.read(size)

        end = pos + size
        if end <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = end
            return self._buffer[pos:end]

        # The request runs past the buffer: hand out what is left of it and
        # take the remainder from the wrapped file.
        self._read = None
        head = self._buffer[pos:]
        return head + self.file.read(size - self._length + pos)

    def prepend(self, prepend=b'', readprevious=False):
        """Push bytes back so that they are returned by the next read().

        With readprevious true, and when at least len(prepend) bytes of the
        current buffer have already been consumed, the read position is
        simply moved back instead of storing the data.
        """
        pos = self._read
        if pos is not None:
            if readprevious and len(prepend) <= pos:
                self._read = pos - len(prepend)
                return
            # Keep the unread tail of the current buffer in front of the
            # new data.
            prepend = self._buffer[pos:] + prepend
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def unused(self):
        """Return the not yet consumed part of the prepended buffer."""
        pos = self._read
        return b'' if pos is None else self._buffer[pos:]

    def seek(self, offset, whence=0):
        # This is only ever called with offset=whence=0
        pos = self._read
        if whence == 1 and pos is not None:
            target = offset + pos
            if 0 <= target <= self._length:
                # A relative seek that stays inside the buffer.
                self._read = target
                return
            offset += self._length - pos
        # Drop the buffer and let the wrapped file do the seek.
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Anything not defined here is looked up on the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000122
123
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound for the size of a single read from the compressed file.
    max_read_chunk = 10 * 1024 * 1024   # 10 MiB

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises ValueError if mode is not one of the supported modes.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # builtins.open() accepts some spellings (such as 'br') that are
            # rejected here; do not leak the file opened above in that case.
            own_file = self.myfileobj
            if own_file is not None:
                self.myfileobj = None
                own_file.close()
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode '.gz' is appended when the name does not already end
        with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the wrapped file rather than the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the state used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the header of a new gzip member to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # The two bytes that follow the timestamp in the fixed part of the
        # header (skipped as "xx" by _read_gzip_header).
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size for the member about to be read."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from self.fileobj.

        Raises EOFError if the file ends before n bytes could be read.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            b = self.fileobj.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Consume the header of the next gzip member.

        Returns False if the file is at EOF (there is no further member) and
        True once a header has been read.  Raises OSError if the magic number
        or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file')

        method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress is still the previous member's
        # decompressor here (_read() creates the new one only after this
        # method returns), and unused() does not consume the bytes --
        # confirm whether this block is still needed.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)
        return True

    def write(self,data):
        """Compress data and write it to the file.

        data may be bytes or any object supporting the buffer protocol.
        Returns the number of uncompressed bytes consumed.  Raises ValueError
        on a closed file and OSError(EBADF) on a file opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to size uncompressed bytes.

        If size is omitted, None or negative, everything up to EOF is
        returned.  Raises ValueError on a closed file and OSError(EBADF) on a
        file opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        if size is None:
            # The io API treats None like a negative size: read everything.
            size = -1

        readsize = 1024
        if size < 0:        # get the whole thing
            while self._read(readsize):
                readsize = min(self.max_read_chunk, readsize * 2)
            size = self.extrasize
        else:               # just get some more of it
            while size > self.extrasize:
                if not self._read(readsize):
                    if size > self.extrasize:
                        size = self.extrasize
                    break
                readsize = min(self.max_read_chunk, readsize * 2)

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size bytes, reading more input only if the buffer
        is empty.

        If size is omitted, None or negative, all buffered data is returned.
        Raises ValueError on a closed file and OSError(EBADF) on a file
        opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        if size is None:
            # The io API treats None like a negative size.
            size = -1

        # For certain input data, a single call to _read() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while self.extrasize <= 0 and self._read():
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered bytes without advancing the stream position.

        At least 100 bytes are requested internally, so more than n bytes
        may be returned.  Raises OSError(EBADF) on a file opened for writing.
        """
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            # Ensure that we don't return b"" if we haven't reached EOF.
            # 1024 is the same buffering heuristic used in read()
            while self.extrasize == 0 and self._read(max(n, 1024)):
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Move the stream position back over buf, which was just read."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their decompressed form.

        Returns False once EOF has been reached (or the file is closed) and
        True otherwise.
        """
        if self.fileobj is None:
            return False

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                return False
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            return False

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
        return True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update CRC/size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been handed out.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the trailer of the member that has just been decompressed.

        Raises OSError if the stored CRC or length does not match.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self.crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the gzip member (in write mode) and release the file.

        The underlying file is only closed if it was opened by the
        constructor.  Calling close() more than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the file (write mode only)."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream.

        Seeking is emulated: in write mode zero bytes are written up to the
        target (backward seeks raise OSError); in read mode data is read and
        discarded, after a rewind() for backward seeks.  Seeking relative to
        the end raises ValueError.  Returns the new position.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline if any.

        If size is non-negative, at most size bytes are returned.  A size
        that is omitted, None or negative means no limit.
        """
        if size is None:
            # The io API treats None like a negative size: no limit.
            size = -1
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000621
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000622
def compress(data, compresslevel=9):
    """Compress *data* in one shot and return the gzip-compressed bytes.

    The optional *compresslevel* (0-9, default 9) is passed straight to
    GzipFile.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with writer:
        writer.write(data)
    # The value is read only after the writer is closed by the with-block.
    return sink.getvalue()
631
def decompress(data):
    """Decompress gzip-compressed *data* in one shot.

    Returns everything GzipFile.read() yields for the given bytes.
    """
    source = io.BytesIO(data)
    reader = GzipFile(fileobj=source)
    with reader:
        payload = reader.read()
    return payload
638
639
def _test():
    """Act like gzip; with -d as the first argument, act like gunzip.

    Arguments are taken from sys.argv.  Each one names a file to process;
    "-" (also the default when no file is given) means read standard input
    and write standard output.  When compressing, the output goes to
    "<name>.gz"; when decompressing, the name must end in ".gz" and the
    output goes to the name without that suffix (other names are reported
    and skipped).

    The input file is not deleted, nor are any other gzip options or
    features supported.
    """
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompress_mode = bool(args) and args[0] == "-d"
    if decompress_mode:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        src = dst = None
        try:
            if decompress_mode:
                if arg == "-":
                    src = GzipFile(filename="", mode="rb",
                                   fileobj=sys.stdin.buffer)
                    dst = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    src = open(arg, "rb")
                    dst = builtins.open(arg[:-3], "wb")
            else:
                if arg == "-":
                    src = sys.stdin.buffer
                    dst = GzipFile(filename="", mode="wb",
                                   fileobj=sys.stdout.buffer)
                else:
                    src = builtins.open(arg, "rb")
                    dst = open(arg + ".gz", "wb")
            while True:
                chunk = src.read(1024)
                if not chunk:
                    break
                dst.write(chunk)
        finally:
            # Close whatever was opened, even if opening the second file
            # or the copy loop failed.  The standard streams are left open.
            try:
                if dst is not None and dst is not sys.stdout.buffer:
                    dst.close()
            finally:
                if src is not None and src is not sys.stdin.buffer:
                    src.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000677
# Run the command-line helper only when this file is executed as a script.
if __name__ == '__main__':
    _test()