blob: a2f23679fa8ccca0a48fb045fe7fa58a2eafa080 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Public names of this module.
__all__ = ["GzipFile","open"]

# Bit masks for the flag byte of a gzip member header.  FEXTRA, FNAME,
# FCOMMENT and FHCRC are tested in GzipFile._read_gzip_header(); FNAME is
# also set by GzipFile._write_gzip_header().
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
18
def write32u(output, value):
    """Write value to output as a 4-byte little-endian integer.

    output only needs a write() method; nothing is returned.
    """
    # The "L" struct code produces the right bit pattern regardless of
    # whether the value is thought of as signed or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
23
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file by name and return the resulting GzipFile.

    Equivalent to GzipFile(filename, mode, compresslevel).  Only filename
    is mandatory: mode falls back to 'rb' and compresslevel to 9.

    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
32
Antoine Pitrou673ddf92010-01-03 22:29:56 +000033class GzipFile(io.BufferedIOBase):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000034 """The GzipFile class simulates most of the methods of a file object with
Guido van Rossum97c5fcc2002-08-06 17:03:25 +000035 the exception of the readinto() and truncate() methods.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000036
37 """
Guido van Rossum15262191997-04-30 16:04:57 +000038
Guido van Rossum68de3791997-07-19 20:22:23 +000039 myfileobj = None
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +000040 max_read_chunk = 10 * 1024 * 1024 # 10Mb
Guido van Rossum68de3791997-07-19 20:22:23 +000041
Tim Peters07e99cb2001-01-14 23:47:14 +000042 def __init__(self, filename=None, mode=None,
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000043 compresslevel=9, fileobj=None, mtime=None):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000044 """Constructor for the GzipFile class.
45
46 At least one of fileobj and filename must be given a
47 non-trivial value.
48
49 The new class instance is based on fileobj, which can be a regular
50 file, a StringIO object, or any other object which simulates a file.
51 It defaults to None, in which case filename is opened to provide
52 a file object.
53
54 When fileobj is not None, the filename argument is only used to be
55 included in the gzip file header, which may includes the original
56 filename of the uncompressed file. It defaults to the filename of
57 fileobj, if discernible; otherwise, it defaults to the empty string,
58 and in this case the original filename is not included in the header.
59
60 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
61 depending on whether the file will be read or written. The default
62 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
63 Be aware that only the 'rb', 'ab', and 'wb' values should be used
64 for cross-platform portability.
65
Nadeem Vawda04050b82012-11-11 13:52:10 +010066 The compresslevel argument is an integer from 0 to 9 controlling the
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000067 level of compression; 1 is fastest and produces the least compression,
Nadeem Vawda04050b82012-11-11 13:52:10 +010068 and 9 is slowest and produces the most compression. 0 is no compression
69 at all. The default is 9.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000070
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000071 The mtime argument is an optional numeric timestamp to be written
72 to the stream when compressing. All gzip compressed streams
73 are required to contain a timestamp. If omitted or None, the
74 current time is used. This module ignores the timestamp when
75 decompressing; however, some programs, such as gunzip, make use
76 of it. The format of the timestamp is the same as that of the
77 return value of time.time() and of the st_mtime member of the
78 object returned by os.stat().
79
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000080 """
81
Nadeem Vawdadd72b3f2012-10-21 18:15:05 +020082 # Make sure we don't inadvertently enable universal newlines on the
83 # underlying file object - in read mode, this causes data corruption.
84 if mode:
85 mode = mode.replace('U', '')
Skip Montanaro12424bc2002-05-23 01:43:05 +000086 # guarantee the file is opened in binary mode on platforms
87 # that care about that sort of thing
88 if mode and 'b' not in mode:
89 mode += 'b'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000090 if fileobj is None:
Fred Drake9bb76d11999-04-05 18:33:40 +000091 fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
Guido van Rossum68de3791997-07-19 20:22:23 +000092 if filename is None:
Nadeem Vawdad7664de2012-01-19 00:40:46 +020093 # Issue #13781: os.fdopen() creates a fileobj with a bogus name
94 # attribute. Avoid saving this in the gzip header's filename field.
95 if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
96 filename = fileobj.name
97 else:
98 filename = ''
Guido van Rossum68de3791997-07-19 20:22:23 +000099 if mode is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000100 if hasattr(fileobj, 'mode'): mode = fileobj.mode
Fred Drake9bb76d11999-04-05 18:33:40 +0000101 else: mode = 'rb'
Guido van Rossum68de3791997-07-19 20:22:23 +0000102
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000103 if mode[0:1] == 'r':
104 self.mode = READ
Tim Peters07e99cb2001-01-14 23:47:14 +0000105 # Set flag indicating start of a new member
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000106 self._new_member = True
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000107 # Buffer data read from gzip file. extrastart is offset in
108 # stream where buffer starts. extrasize is number of
109 # bytes remaining in buffer from current stream position.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000110 self.extrabuf = ""
111 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000112 self.extrastart = 0
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000113 self.name = filename
Bob Ippolitod82c3102006-05-22 15:59:12 +0000114 # Starts small, scales exponentially
115 self.min_readsize = 100
Guido van Rossum15262191997-04-30 16:04:57 +0000116
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000117 elif mode[0:1] == 'w' or mode[0:1] == 'a':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000118 self.mode = WRITE
119 self._init_write(filename)
120 self.compress = zlib.compressobj(compresslevel,
Tim Peters07e99cb2001-01-14 23:47:14 +0000121 zlib.DEFLATED,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000122 -zlib.MAX_WBITS,
123 zlib.DEF_MEM_LEVEL,
124 0)
125 else:
Martin v. Löwisdb044892002-03-11 06:46:52 +0000126 raise IOError, "Mode " + mode + " not supported"
Guido van Rossum15262191997-04-30 16:04:57 +0000127
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000128 self.fileobj = fileobj
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000129 self.offset = 0
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000130 self.mtime = mtime
Guido van Rossum15262191997-04-30 16:04:57 +0000131
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000132 if self.mode == WRITE:
133 self._write_gzip_header()
Guido van Rossum15262191997-04-30 16:04:57 +0000134
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000135 @property
136 def filename(self):
137 import warnings
Philip Jenveyd846f1d2009-05-08 02:28:39 +0000138 warnings.warn("use the name attribute", DeprecationWarning, 2)
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000139 if self.mode == WRITE and self.name[-3:] != ".gz":
140 return self.name + ".gz"
141 return self.name
142
Guido van Rossum15262191997-04-30 16:04:57 +0000143 def __repr__(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000144 s = repr(self.fileobj)
145 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
Guido van Rossum15262191997-04-30 16:04:57 +0000146
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000147 def _check_closed(self):
148 """Raises a ValueError if the underlying file object has been closed.
149
150 """
151 if self.closed:
152 raise ValueError('I/O operation on closed file.')
153
Guido van Rossum15262191997-04-30 16:04:57 +0000154 def _init_write(self, filename):
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000155 self.name = filename
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000156 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000157 self.size = 0
158 self.writebuf = []
159 self.bufsize = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000160
161 def _write_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000162 self.fileobj.write('\037\213') # magic header
163 self.fileobj.write('\010') # compression method
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +0000164 fname = os.path.basename(self.name)
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000165 if fname.endswith(".gz"):
166 fname = fname[:-3]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000167 flags = 0
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000168 if fname:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000169 flags = FNAME
170 self.fileobj.write(chr(flags))
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000171 mtime = self.mtime
172 if mtime is None:
173 mtime = time.time()
174 write32u(self.fileobj, long(mtime))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000175 self.fileobj.write('\002')
176 self.fileobj.write('\377')
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000177 if fname:
178 self.fileobj.write(fname + '\000')
Guido van Rossum15262191997-04-30 16:04:57 +0000179
180 def _init_read(self):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000181 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000182 self.size = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000183
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200184 def _read_exact(self, n):
185 data = self.fileobj.read(n)
186 while len(data) < n:
187 b = self.fileobj.read(n - len(data))
188 if not b:
189 raise EOFError("Compressed file ended before the "
190 "end-of-stream marker was reached")
191 data += b
192 return data
193
Guido van Rossum15262191997-04-30 16:04:57 +0000194 def _read_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000195 magic = self.fileobj.read(2)
196 if magic != '\037\213':
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000197 raise IOError, 'Not a gzipped file'
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200198
199 method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000200 if method != 8:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000201 raise IOError, 'Unknown compression method'
Guido van Rossum15262191997-04-30 16:04:57 +0000202
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000203 if flag & FEXTRA:
204 # Read & discard the extra field, if present
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200205 self._read_exact(struct.unpack("<H", self._read_exact(2)))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000206 if flag & FNAME:
207 # Read and discard a null-terminated string containing the filename
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000208 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000209 s = self.fileobj.read(1)
210 if not s or s=='\000':
211 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000212 if flag & FCOMMENT:
213 # Read and discard a null-terminated string containing a comment
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000214 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000215 s = self.fileobj.read(1)
216 if not s or s=='\000':
217 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000218 if flag & FHCRC:
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200219 self._read_exact(2) # Read & discard the 16-bit header CRC
Guido van Rossum15262191997-04-30 16:04:57 +0000220
Guido van Rossum15262191997-04-30 16:04:57 +0000221 def write(self,data):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000222 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000223 if self.mode != WRITE:
224 import errno
225 raise IOError(errno.EBADF, "write() on read-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000226
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000227 if self.fileobj is None:
228 raise ValueError, "write() on closed GzipFile object"
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000229
230 # Convert data type if called by io.BufferedWriter.
231 if isinstance(data, memoryview):
232 data = data.tobytes()
233
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000234 if len(data) > 0:
235 self.size = self.size + len(data)
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000236 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000237 self.fileobj.write( self.compress.compress(data) )
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000238 self.offset += len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000239
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000240 return len(data)
241
Guido van Rossum56068012000-02-02 16:51:06 +0000242 def read(self, size=-1):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000243 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000244 if self.mode != READ:
245 import errno
Brett Cannonedfb3022003-12-04 19:28:06 +0000246 raise IOError(errno.EBADF, "read() on write-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000247
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000248 if self.extrasize <= 0 and self.fileobj is None:
249 return ''
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000250
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000251 readsize = 1024
Guido van Rossum56068012000-02-02 16:51:06 +0000252 if size < 0: # get the whole thing
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200253 while self._read(readsize):
254 readsize = min(self.max_read_chunk, readsize * 2)
255 size = self.extrasize
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000256 else: # just get some more of it
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200257 while size > self.extrasize:
258 if not self._read(readsize):
259 if size > self.extrasize:
260 size = self.extrasize
261 break
262 readsize = min(self.max_read_chunk, readsize * 2)
Tim Peters07e99cb2001-01-14 23:47:14 +0000263
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000264 offset = self.offset - self.extrastart
265 chunk = self.extrabuf[offset: offset + size]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000266 self.extrasize = self.extrasize - size
Guido van Rossum15262191997-04-30 16:04:57 +0000267
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000268 self.offset += size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000269 return chunk
Guido van Rossum15262191997-04-30 16:04:57 +0000270
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000271 def _unread(self, buf):
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000272 self.extrasize = len(buf) + self.extrasize
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000273 self.offset -= len(buf)
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000274
275 def _read(self, size=1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000276 if self.fileobj is None:
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200277 return False
Tim Peters07e99cb2001-01-14 23:47:14 +0000278
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000279 if self._new_member:
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000280 # If the _new_member flag is set, we have to
281 # jump to the next member, if there is one.
Tim Peters07e99cb2001-01-14 23:47:14 +0000282 #
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000283 # First, check if we're at the end of the file;
284 # if so, it's time to stop; no more members to read.
285 pos = self.fileobj.tell() # Save current position
286 self.fileobj.seek(0, 2) # Seek to end of file
287 if pos == self.fileobj.tell():
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200288 return False
Tim Peters07e99cb2001-01-14 23:47:14 +0000289 else:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000290 self.fileobj.seek( pos ) # Return to original position
Tim Peters07e99cb2001-01-14 23:47:14 +0000291
292 self._init_read()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000293 self._read_gzip_header()
294 self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000295 self._new_member = False
Tim Peters07e99cb2001-01-14 23:47:14 +0000296
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000297 # Read a chunk of data from the file
298 buf = self.fileobj.read(size)
Tim Peters07e99cb2001-01-14 23:47:14 +0000299
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000300 # If the EOF has been reached, flush the decompression object
301 # and mark this object as finished.
Tim Peters07e99cb2001-01-14 23:47:14 +0000302
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000303 if buf == "":
304 uncompress = self.decompress.flush()
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200305 self.fileobj.seek(-len(self.decompress.unused_data), 1)
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000306 self._read_eof()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000307 self._add_read_data( uncompress )
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200308 return False
Tim Peters07e99cb2001-01-14 23:47:14 +0000309
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000310 uncompress = self.decompress.decompress(buf)
311 self._add_read_data( uncompress )
312
313 if self.decompress.unused_data != "":
314 # Ending case: we've come to the end of a member in the file,
315 # so seek back to the start of the unused data, finish up
316 # this member, and read a new gzip header.
317 # (The number of bytes to seek back is the length of the unused
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200318 # data)
319 self.fileobj.seek(-len(self.decompress.unused_data), 1)
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000320
321 # Check the CRC and file size, and set the flag so we read
Tim Peters07e99cb2001-01-14 23:47:14 +0000322 # a new member on the next call
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000323 self._read_eof()
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000324 self._new_member = True
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200325 return True
Tim Peters07e99cb2001-01-14 23:47:14 +0000326
327 def _add_read_data(self, data):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000328 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000329 offset = self.offset - self.extrastart
330 self.extrabuf = self.extrabuf[offset:] + data
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000331 self.extrasize = self.extrasize + len(data)
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000332 self.extrastart = self.offset
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000333 self.size = self.size + len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000334
335 def _read_eof(self):
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200336 # We've read to the end of the file.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000337 # We check the that the computed CRC and size of the
Tim Peters9288f952002-11-05 20:38:55 +0000338 # uncompressed data matches the stored values. Note that the size
339 # stored is the true file size mod 2**32.
Serhiy Storchaka353e54e2013-01-22 17:13:26 +0200340 crc32, isize = struct.unpack("<II", self._read_exact(8))
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000341 if crc32 != self.crc:
342 raise IOError("CRC check failed %s != %s" % (hex(crc32),
343 hex(self.crc)))
Gregory P. Smithac830e92008-03-23 23:43:02 +0000344 elif isize != (self.size & 0xffffffffL):
Andrew M. Kuchling64edd6a2003-02-05 21:35:07 +0000345 raise IOError, "Incorrect length of data produced"
Tim Peters07e99cb2001-01-14 23:47:14 +0000346
Antoine Pitrou5a9112c2010-01-13 14:32:10 +0000347 # Gzip files can be padded with zeroes and still have archives.
348 # Consume all zero bytes and set the file position to the first
349 # non-zero byte. See http://www.gzip.org/#faq8
350 c = "\x00"
351 while c == "\x00":
352 c = self.fileobj.read(1)
353 if c:
354 self.fileobj.seek(-1, 1)
355
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000356 @property
357 def closed(self):
358 return self.fileobj is None
359
Guido van Rossum15262191997-04-30 16:04:57 +0000360 def close(self):
Georg Brandle08e3d02008-05-25 08:07:37 +0000361 if self.fileobj is None:
362 return
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000363 if self.mode == WRITE:
364 self.fileobj.write(self.compress.flush())
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000365 write32u(self.fileobj, self.crc)
Tim Peters9288f952002-11-05 20:38:55 +0000366 # self.size may exceed 2GB, or even 4GB
Gregory P. Smithdd102842008-03-23 23:45:12 +0000367 write32u(self.fileobj, self.size & 0xffffffffL)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000368 self.fileobj = None
369 elif self.mode == READ:
370 self.fileobj = None
371 if self.myfileobj:
372 self.myfileobj.close()
373 self.myfileobj = None
Guido van Rossum15262191997-04-30 16:04:57 +0000374
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000375 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000376 self._check_closed()
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000377 if self.mode == WRITE:
Tim Peterseba28be2005-03-28 01:08:02 +0000378 # Ensure the compressor's buffer is flushed
379 self.fileobj.write(self.compress.flush(zlib_mode))
Mark Dickinson3b0b4ff2010-05-04 18:45:27 +0000380 self.fileobj.flush()
Guido van Rossum15262191997-04-30 16:04:57 +0000381
Tim Peters5cfb05e2004-07-27 21:02:02 +0000382 def fileno(self):
383 """Invoke the underlying file object's fileno() method.
384
385 This will raise AttributeError if the underlying file object
386 doesn't support fileno().
387 """
388 return self.fileobj.fileno()
389
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000390 def rewind(self):
391 '''Return the uncompressed stream file position indicator to the
Tim Petersab9ba272001-08-09 21:40:30 +0000392 beginning of the file'''
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000393 if self.mode != READ:
394 raise IOError("Can't rewind in write mode")
395 self.fileobj.seek(0)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000396 self._new_member = True
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000397 self.extrabuf = ""
398 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000399 self.extrastart = 0
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000400 self.offset = 0
401
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000402 def readable(self):
403 return self.mode == READ
404
405 def writable(self):
406 return self.mode == WRITE
407
408 def seekable(self):
409 return True
410
Martin v. Löwis065f0c82006-11-12 10:41:39 +0000411 def seek(self, offset, whence=0):
412 if whence:
413 if whence == 1:
414 offset = self.offset + offset
415 else:
416 raise ValueError('Seek from end not supported')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000417 if self.mode == WRITE:
418 if offset < self.offset:
419 raise IOError('Negative seek in write mode')
420 count = offset - self.offset
Chris Withers2cc0b072012-11-09 15:48:17 +0000421 for i in xrange(count // 1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000422 self.write(1024 * '\0')
423 self.write((count % 1024) * '\0')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000424 elif self.mode == READ:
425 if offset < self.offset:
426 # for negative seek, rewind and do positive seek
427 self.rewind()
428 count = offset - self.offset
Chris Withers2cc0b072012-11-09 15:48:17 +0000429 for i in xrange(count // 1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000430 self.read(1024)
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000431 self.read(count % 1024)
432
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000433 return self.offset
434
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000435 def readline(self, size=-1):
Bob Ippolitod82c3102006-05-22 15:59:12 +0000436 if size < 0:
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000437 # Shortcut common case - newline found in buffer.
438 offset = self.offset - self.extrastart
439 i = self.extrabuf.find('\n', offset) + 1
440 if i > 0:
441 self.extrasize -= i - offset
442 self.offset += i - offset
443 return self.extrabuf[offset: i]
444
Bob Ippolitod82c3102006-05-22 15:59:12 +0000445 size = sys.maxint
446 readsize = self.min_readsize
447 else:
448 readsize = size
Bob Ippolitob9759732006-05-22 15:22:46 +0000449 bufs = []
Bob Ippolitod82c3102006-05-22 15:59:12 +0000450 while size != 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000451 c = self.read(readsize)
Eric S. Raymondee5e61d2001-02-09 09:10:35 +0000452 i = c.find('\n')
Bob Ippolitod82c3102006-05-22 15:59:12 +0000453
454 # We set i=size to break out of the loop under two
455 # conditions: 1) there's no newline, and the chunk is
456 # larger than size, or 2) there is a newline, but the
457 # resulting line would be longer than 'size'.
458 if (size <= i) or (i == -1 and len(c) > size):
459 i = size - 1
Guido van Rossum15262191997-04-30 16:04:57 +0000460
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000461 if i >= 0 or c == '':
Bob Ippolitod82c3102006-05-22 15:59:12 +0000462 bufs.append(c[:i + 1]) # Add portion of last chunk
463 self._unread(c[i + 1:]) # Push back rest of chunk
464 break
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000465
Bob Ippolitob9759732006-05-22 15:22:46 +0000466 # Append chunk to list, decrease 'size',
467 bufs.append(c)
468 size = size - len(c)
469 readsize = min(size, readsize * 2)
Bob Ippolitod82c3102006-05-22 15:59:12 +0000470 if readsize > self.min_readsize:
471 self.min_readsize = min(readsize, self.min_readsize * 2, 512)
472 return ''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000473
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000474
475def _test():
476 # Act like gzip; with -d, act like gunzip.
477 # The input file is not deleted, however, nor are any other gzip
478 # options or features supported.
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000479 args = sys.argv[1:]
480 decompress = args and args[0] == "-d"
481 if decompress:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000482 args = args[1:]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000483 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000484 args = ["-"]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000485 for arg in args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000486 if decompress:
487 if arg == "-":
488 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
489 g = sys.stdout
490 else:
491 if arg[-3:] != ".gz":
Walter Dörwald70a6b492004-02-12 17:35:32 +0000492 print "filename doesn't end in .gz:", repr(arg)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000493 continue
494 f = open(arg, "rb")
495 g = __builtin__.open(arg[:-3], "wb")
496 else:
497 if arg == "-":
498 f = sys.stdin
499 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
500 else:
501 f = __builtin__.open(arg, "rb")
502 g = open(arg + ".gz", "wb")
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000503 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000504 chunk = f.read(1024)
505 if not chunk:
506 break
507 g.write(chunk)
508 if g is not sys.stdout:
509 g.close()
510 if f is not sys.stdin:
511 f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000512
# Run the command-line driver only when executed as a script, not on import.
if __name__ == '__main__':
    _test()