blob: 76ace394f482ad193643bbe100c36c80c42f9732 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
Skip Montanaro2dd42762001-01-23 15:35:05 +000013__all__ = ["GzipFile","open"]
14
# Bit masks tested against (and written into) the flag byte of a gzip
# member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method accepting a byte string.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read 4 bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted in little-endian order.  struct.error is
    raised if fewer than 4 bytes are available.
    """
    (value,) = struct.unpack("<I", input.read(4))
    return value
26
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for *filename*.

    This is shorthand for GzipFile(filename, mode, compresslevel).  Only
    the filename is required: mode defaults to 'rb' and compresslevel
    defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class GzipFile(io.BufferedIOBase):
    """File-like object that reads or writes a gzip-compressed stream.

    The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (as opposed to one passed
    # in by the caller through 'fileobj').  close() closes this object only.
    myfileobj = None
    # Upper bound on the chunk size that read() asks _read() to fetch.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  0 is no
        compression at all.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.
        If the constructor opened the file itself and then fails, that
        file is closed again before the exception propagates.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')

        initialized = False
        try:
            if filename is None:
                # Issue #13781: os.fdopen() creates a fileobj with a bogus
                # name attribute.  Avoid saving this in the gzip header's
                # filename field.
                filename = getattr(fileobj, 'name', '')
                if (not isinstance(filename, basestring)
                        or filename == '<fdopen>'):
                    filename = ''
            if mode is None:
                if hasattr(fileobj, 'mode'):
                    mode = fileobj.mode
                else:
                    mode = 'rb'

            if mode[0:1] == 'r':
                self.mode = READ
                # Set flag indicating start of a new member
                self._new_member = True
                # Buffer data read from gzip file. extrastart is offset in
                # stream where buffer starts. extrasize is number of
                # bytes remaining in buffer from current stream position.
                self.extrabuf = ""
                self.extrasize = 0
                self.extrastart = 0
                self.name = filename
                # Starts small, scales exponentially
                self.min_readsize = 100

            elif mode[0:1] == 'w' or mode[0:1] == 'a':
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: zlib emits a raw deflate stream, because
                # the gzip header and trailer are written by this class.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
            else:
                raise IOError("Mode " + mode + " not supported")

            self.fileobj = fileobj
            self.offset = 0
            self.mtime = mtime

            if self.mode == WRITE:
                self._write_gzip_header()
            initialized = True
        finally:
            if not initialized:
                # Construction failed: do not leak a file that was opened
                # here.  A caller-supplied fileobj is left untouched.
                owned = self.myfileobj
                if owned is not None:
                    self.myfileobj = None
                    owned.close()

    @property
    def filename(self):
        """Deprecated alias for 'name'; '.gz' is appended in write mode."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if this GzipFile has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header (see RFC 1952) to self.fileobj."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, str):
                fname = fname.encode('latin-1')
            if fname.endswith('.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = ''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # operating system byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from self.fileobj.

        Only the timestamp is kept (in self.mtime); optional fields are
        read and discarded.  Raises IOError if the magic number or the
        compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        self.mtime = read32(self.fileobj)
        # Skip the extra-flags byte and the operating-system byte.
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data*, write it out, and return len(data).

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes; all of them if size < 0.

        Fewer bytes are returned only when the end of the stream is
        reached.  Raises ValueError if the file is closed and IOError
        (EBADF) if it was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push the last len(buf) bytes handed out back into the buffer.

        Only the counters move; the bytes themselves are still present in
        self.extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer their decompressed form.

        Raises EOFError once there is nothing left to read; read() relies
        on that exception to end its loops.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer; update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that was already handed out, so that
        # the buffer starts at the current stream position again.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the trailer of a member.

        Raises IOError if either value does not match what was computed
        while decompressing.  Trailing zero padding is skipped.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  Only a file opened by the constructor
        is closed; a caller-supplied fileobj is left open.  Calling
        close() more than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file.

        Does nothing in read mode.  Raises ValueError if the file is
        closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream; return the position.

        *whence* may be 0 (absolute) or 1 (relative to the current
        position); seeking from the end raises ValueError.  In write mode
        only forward seeks are possible and the gap is filled with zero
        bytes.  In read mode a backward seek rewinds and decompresses
        again from the start of the file.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Return the next line, including its trailing newline if any.

        At most *size* bytes are returned when size is non-negative.
        Raises ValueError if the file is closed.
        """
        # Check first, so that a closed file never hands out stale data
        # from the buffer through the shortcut below.
        self._check_closed()
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size-1 to cut the chunk after 'size' bytes under
            # two conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)    # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000485
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000486
def _test():
    """Act like gzip; with -d, act like gunzip.

    The command line is taken from sys.argv: an optional leading "-d"
    followed by file names, where "-" (also the default when no name is
    given) stands for stdin/stdout.  The input file is not deleted,
    however, nor are any other gzip options or features supported.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        f = g = None
        try:
            if decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                    g = sys.stdout
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz: " + repr(arg))
                        continue
                    f = open(arg, "rb")
                    g = __builtin__.open(arg[:-3], "wb")
            else:
                if arg == "-":
                    f = sys.stdin
                    g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
                else:
                    f = __builtin__.open(arg, "rb")
                    g = open(arg + ".gz", "wb")
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release whatever was opened for this argument, even if opening
            # the second file or the copy itself failed.  The standard
            # streams are never closed.
            try:
                if g is not None and g is not sys.stdout:
                    g.close()
            finally:
                if f is not None and f is not sys.stdin:
                    f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000524
# Running the module as a script turns it into a minimal gzip/gunzip tool.
if __name__ == "__main__":
    _test()