blob: a613bae876dd3444162a81f21875c75133d21a42 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
Skip Montanaro2dd42762001-01-23 15:35:05 +000013__all__ = ["GzipFile","open"]
14
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    *output* only needs a write() method; *value* is packed with the
    struct "<L" format.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted in little-endian order (struct "<I").
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
26
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (stays None when the
    # caller supplied fileobj); close() is responsible for closing it.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip framing is
            # written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that this constructor opened itself
            # (reachable e.g. with mode='U', which is stripped to '').
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode a '.gz' suffix is appended when the name does not
        already end with it.  Accessing this emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        # Strip the outer '<' and '>' of the wrapped object's repr.
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object.

        The stored file name is the basename of self.name with a trailing
        '.gz' removed; it is omitted entirely when that is empty.
        """
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Stores the header timestamp in self.mtime and skips the optional
        fields.  Raises IOError when the magic number or the compression
        method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        self.mtime = read32(self.fileobj)
        # Skip the extra-flags byte and the OS byte.
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data* and write it out; return len(data).

        Raises ValueError on a closed file and IOError(EBADF) when the
        object was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes (everything if size < 0).

        Raises ValueError on a closed file and IOError(EBADF) when the
        object was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push *buf* back: only the counters move, the bytes are still
        held in self.extrabuf."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer what they decompress to.

        Raises EOFError once there is nothing left to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer and update CRC/size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been handed out.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer, then skip any zero padding after it.

        Raises IOError when the stored CRC or length does not match what
        was computed while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check the that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file object supplied by the caller
        is left open; a file opened by the constructor is closed.  Calling
        close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Detach first so the object counts as closed even if the final
        # writes below fail.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release the file we opened ourselves, even when
            # writing the trailer raised.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file.

        Raises ValueError on a closed file.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream; return the new offset.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  Writing seeks forward by emitting zero bytes and
        cannot go backwards (IOError).  Reading seeks forward by reading
        and discarding, and backwards by rewinding first.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Return the next line, including its newline when one is found.

        With a non-negative *size*, at most that many bytes are returned.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)    # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000477
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000478
def _test():
    """Minimal command-line driver: compress, or with -d decompress.

    Each argument is a file name; "-" (also the default when no name is
    given) means stdin/stdout.  Compressing writes to <name>.gz and
    decompressing writes to the name with its .gz suffix removed.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: " + repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release both handles even if the copy fails part-way; the
            # process-wide stdin/stdout objects are never closed here.
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()


if __name__ == '__main__':
    _test()