blob: 5bcfe6123afe0f480e30b3a9fc7d159f05cdf702 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks tested against / written into the flag byte of a gzip member
# header (see GzipFile._read_gzip_header and GzipFile._write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ, WRITE = 1, 2
18
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str or bytes) or an already-open file object
    (anything exposing a read or write attribute) to read from or write to.

    mode is one of "r", "rb", "w", "wb", "a" or "ab" for binary mode, or
    "rt", "wt" or "at" for text mode.  It defaults to "rb"; compresslevel
    defaults to 9.

    In binary mode the result is GzipFile(filename, mode, compresslevel) and
    encoding, errors and newline must be left as None (ValueError otherwise).

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and line ending(s).

    Raises ValueError for a mode mixing "t" and "b", and TypeError when
    filename is neither a str/bytes object nor a file-like object.

    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # Text-only arguments are rejected in binary mode, checked in the
        # order encoding, errors, newline.
        text_only = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for value, message in text_only:
            if value is not None:
                raise ValueError(message)

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes)):
        raw = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return raw
    return io.TextIOWrapper(raw, encoding, errors, newline)
62
def write32u(output, value):
    """Write value to output as a 4-byte little-endian integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
67
def read32(input):
    """Read 4 bytes from input and return them as an unsigned
    little-endian integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000070
Antoine Pitrou7b969842010-09-23 16:22:51 +000071class _PaddedFile:
72 """Minimal read-only file object that prepends a string to the contents
73 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
74 essential functionality."""
75
76 def __init__(self, f, prepend=b''):
77 self._buffer = prepend
78 self._length = len(prepend)
79 self.file = f
80 self._read = 0
81
82 def read(self, size):
83 if self._read is None:
84 return self.file.read(size)
85 if self._read + size <= self._length:
86 read = self._read
87 self._read += size
88 return self._buffer[read:self._read]
89 else:
90 read = self._read
91 self._read = None
92 return self._buffer[read:] + \
93 self.file.read(size-self._length+read)
94
95 def prepend(self, prepend=b'', readprevious=False):
96 if self._read is None:
97 self._buffer = prepend
98 elif readprevious and len(prepend) <= self._read:
99 self._read -= len(prepend)
100 return
101 else:
102 self._buffer = self._buffer[read:] + prepend
103 self._length = len(self._buffer)
104 self._read = 0
105
106 def unused(self):
107 if self._read is None:
108 return b''
109 return self._buffer[self._read:]
110
111 def seek(self, offset, whence=0):
112 # This is only ever called with offset=whence=0
113 if whence == 1 and self._read is not None:
114 if 0 <= offset + self._read <= self._length:
115 self._read += offset
116 return
117 else:
118 offset += self._length - self._read
119 self._read = None
120 self._buffer = None
121 return self.file.seek(offset, whence)
122
123 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +0000124 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000125
126
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises ValueError if mode requests text or universal-newline mode,
        or does not start with 'r', 'w' or 'a'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        # Position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for name; in write mode '.gz' is appended when
        the name does not already end with it."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        # Show the wrapped file rather than the internal padding wrapper.
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the name, running CRC and size counters for writing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Two fixed bytes; the reader treats the corresponding positions as
        # "extraflag" and "os" and discards them.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size counters for a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Stores the header timestamp in self.mtime and skips the optional
        extra, filename, comment and header-CRC fields.

        Raises EOFError when no more data is available and IOError when the
        magic number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): _read() replaces self.decompress right after this
        # method returns, and unused() does not consume the padded buffer,
        # so this looks like it feeds the previous member's decompressor --
        # confirm whether this step is still needed.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it out; return the number of
        uncompressed bytes accepted (len(data))."""
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes, or everything up to EOF
        when size is negative."""
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size uncompressed bytes, reading more compressed
        input only while no decompressed data is buffered."""
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            # For certain input data, a single call to _read() may not return
            # any data. In this case, retry until we get some data or reach EOF.
            while self.extrasize <= 0:
                self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the
        position; n values below 100 are treated as 100.

        Raises ValueError if the file is closed.
        """
        # Be consistent with read()/read1()/write(): operating on a closed
        # file is an error rather than a silent empty result.
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back by adjusting the buffer counters (the bytes are
        still held in extrabuf)."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and append their decompressed
        form to the internal buffer.

        Raises EOFError when the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Fold data into the running CRC/size and append it to the
        read buffer, dropping bytes already consumed."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Read the member trailer and verify it against the running
        CRC and size; raises IOError on mismatch."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  A file object opened by the constructor is closed
        even if writing the trailer fails; a caller-supplied fileobj is left
        open.  Calling close() on an already closed object is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so a failure below cannot leave
        # it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with zlib_mode and then
        flush the underlying file object."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream and return it.

        Only absolute (whence=0) and relative (whence=1) seeks are
        supported.  In write mode the gap is filled by writing zero bytes
        and seeking backwards raises IOError; in read mode a backwards seek
        rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line (including the trailing newline, if
        any), limited to size bytes when size is non-negative."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000619
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000620
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the resulting bytes.

    compresslevel is the optional compression level, in range of 1-9.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with writer:
        writer.write(data)
    # The trailer is only written when the GzipFile is closed, so the
    # buffer is collected after the with block.
    return sink.getvalue()
629
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
636
637
def _test():
    """Command-line driver: compress each named file, or decompress with -d.

    A file name of "-" (the default when no names are given) means standard
    input/output.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The streams used above are the binary .buffer objects, so compare
        # against those; comparing against sys.stdout/sys.stdin never matched
        # and closed the standard streams after the first "-" argument.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()

if __name__ == '__main__':
    _test()