"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import __builtin__
import errno
import io
import os
import struct
import sys
import time
import zlib

# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header / _write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2

def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method.  *value* is packed with the
    struct "<L" format, so it is expected to fit in an unsigned 32-bit
    integer; callers in this module mask with 0xffffffff first.
    """
    output.write(struct.pack("<L", value))

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by __init__ itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound on the compressed chunk size requested by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, because the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Don't leak a file that was opened above just for us.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        Emits a DeprecationWarning.  In write mode a ".gz" suffix is
        appended to the returned value if the name does not already end
        with one.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        # Embed the repr of the underlying file object, minus its
        # first and last characters (normally the enclosing '<' and '>').
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object.

        The stored filename is the basename of self.name with a trailing
        ".gz" removed; it is omitted when empty or when it cannot be
        encoded as Latin-1.
        """
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, str):
                fname = fname.encode('latin-1')
            if fname.endswith(".gz"):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = ''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags (XFL)
        self.fileobj.write('\377')                 # OS: unknown
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from the underlying file object.

        Raises EOFError if the file ends before n bytes were read.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            b = self.fileobj.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse and skip a gzip member header.

        Stores the header timestamp in self.mtime.  Raises IOError if the
        magic number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')

        # method, flags, mtime; the last two header bytes are skipped.
        method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise IOError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file object.

        Returns the number of uncompressed bytes consumed, len(data).
        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Read and return up to size uncompressed bytes.

        A negative size reads until the end of the compressed data.
        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            while self._read(readsize):
                readsize = min(self.max_read_chunk, readsize * 2)
            size = self.extrasize
        else:               # just get some more of it
            while size > self.extrasize:
                if not self._read(readsize):
                    if size > self.extrasize:
                        size = self.extrasize
                    break
                readsize = min(self.max_read_chunk, readsize * 2)

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back so that it is returned again by the next read.

        Only the counters are adjusted; the bytes themselves are not
        copied, so buf is expected to be the tail of what was just read.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Decompress up to size more compressed bytes into the read buffer.

        Returns False when there is nothing more to read and True
        otherwise.
        """
        if self.fileobj is None:
            return False

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                return False
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self.fileobj.seek(-len(self.decompress.unused_data), 1)
            self._read_eof()
            self._add_read_data(uncompress)
            return False

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data)
            self.fileobj.seek(-len(self.decompress.unused_data), 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
        return True

    def _add_read_data(self, data):
        """Append freshly decompressed data to the read buffer.

        Updates the running CRC and size, and drops the part of the
        buffer that lies before the current stream position.
        """
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Read and verify the 8-byte trailer of a gzip member.

        Raises IOError if the stored CRC or length does not match the
        values computed while decompressing.
        """
        # We've read to the end of the file.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the gzip stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  Only a file opened by the constructor
        itself is closed; a caller-supplied fileobj is left open.  Calling
        close() more than once is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front and use try/finally so that a
        # failing write can neither leave it half-open nor leak myfileobj.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file object.

        zlib_mode is passed to the compressor's flush().  In read mode
        this only checks that the file is not closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always return True; see seek() for the supported operations."""
        return True

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream and return it.

        whence may be 0 (absolute) or 1 (relative to the current
        position); any other value raises ValueError.  In write mode only
        forward seeks are possible and the gap is filled by writing zero
        bytes; a backward seek raises IOError.  In read mode a backward
        seek rewinds to the start and reads forward again.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        With a non-negative size, at most size bytes are returned.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i = size - 1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)    # Return resulting line


def _test():
    """Act like gzip; with -d, act like gunzip.

    The input file is not deleted, however, nor are any other gzip
    options or features supported.  Each command-line argument is a file
    name; "-" (also the default when no name is given) means standard
    input/output.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: " + repr(arg))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even if the copy fails part-way; the process's
        # own standard streams are left open.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()

# Run the minimal gzip/gunzip command-line driver when executed as a script.
if __name__ == '__main__':
    _test()