blob: 2ae7c0cffe3817d03a6dad2efcf4a85bca04309e [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Public names exported by a star-import of this module.
__all__ = ["GzipFile","open"]
14
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output only needs a write() method accepting the packed four bytes.
    """
    # "<L" is struct's little-endian, standard-size (4 byte) unsigned long.
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from input and decode them as a little-endian
    unsigned 32-bit integer.

    input only needs a read() method.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return a GzipFile object.

    This is a convenience wrapper equivalent to
    GzipFile(filename, mode, compresslevel).  filename must be given;
    mode defaults to 'rb' and compresslevel to 9.

    """
    gzipped = GzipFile(filename, mode, compresslevel)
    return gzipped
35
Antoine Pitrou673ddf92010-01-03 22:29:56 +000036class GzipFile(io.BufferedIOBase):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000037 """The GzipFile class simulates most of the methods of a file object with
Guido van Rossum97c5fcc2002-08-06 17:03:25 +000038 the exception of the readinto() and truncate() methods.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000039
40 """
Guido van Rossum15262191997-04-30 16:04:57 +000041
Guido van Rossum68de3791997-07-19 20:22:23 +000042 myfileobj = None
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +000043 max_read_chunk = 10 * 1024 * 1024 # 10Mb
Guido van Rossum68de3791997-07-19 20:22:23 +000044
Tim Peters07e99cb2001-01-14 23:47:14 +000045 def __init__(self, filename=None, mode=None,
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000046 compresslevel=9, fileobj=None, mtime=None):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000047 """Constructor for the GzipFile class.
48
49 At least one of fileobj and filename must be given a
50 non-trivial value.
51
52 The new class instance is based on fileobj, which can be a regular
53 file, a StringIO object, or any other object which simulates a file.
54 It defaults to None, in which case filename is opened to provide
55 a file object.
56
57 When fileobj is not None, the filename argument is only used to be
58 included in the gzip file header, which may includes the original
59 filename of the uncompressed file. It defaults to the filename of
60 fileobj, if discernible; otherwise, it defaults to the empty string,
61 and in this case the original filename is not included in the header.
62
63 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
64 depending on whether the file will be read or written. The default
65 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
66 Be aware that only the 'rb', 'ab', and 'wb' values should be used
67 for cross-platform portability.
68
69 The compresslevel argument is an integer from 1 to 9 controlling the
70 level of compression; 1 is fastest and produces the least compression,
71 and 9 is slowest and produces the most compression. The default is 9.
72
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000073 The mtime argument is an optional numeric timestamp to be written
74 to the stream when compressing. All gzip compressed streams
75 are required to contain a timestamp. If omitted or None, the
76 current time is used. This module ignores the timestamp when
77 decompressing; however, some programs, such as gunzip, make use
78 of it. The format of the timestamp is the same as that of the
79 return value of time.time() and of the st_mtime member of the
80 object returned by os.stat().
81
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000082 """
83
Nadeem Vawdadd72b3f2012-10-21 18:15:05 +020084 # Make sure we don't inadvertently enable universal newlines on the
85 # underlying file object - in read mode, this causes data corruption.
86 if mode:
87 mode = mode.replace('U', '')
Skip Montanaro12424bc2002-05-23 01:43:05 +000088 # guarantee the file is opened in binary mode on platforms
89 # that care about that sort of thing
90 if mode and 'b' not in mode:
91 mode += 'b'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000092 if fileobj is None:
Fred Drake9bb76d11999-04-05 18:33:40 +000093 fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
Guido van Rossum68de3791997-07-19 20:22:23 +000094 if filename is None:
Nadeem Vawdad7664de2012-01-19 00:40:46 +020095 # Issue #13781: os.fdopen() creates a fileobj with a bogus name
96 # attribute. Avoid saving this in the gzip header's filename field.
97 if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
98 filename = fileobj.name
99 else:
100 filename = ''
Guido van Rossum68de3791997-07-19 20:22:23 +0000101 if mode is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000102 if hasattr(fileobj, 'mode'): mode = fileobj.mode
Fred Drake9bb76d11999-04-05 18:33:40 +0000103 else: mode = 'rb'
Guido van Rossum68de3791997-07-19 20:22:23 +0000104
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000105 if mode[0:1] == 'r':
106 self.mode = READ
Tim Peters07e99cb2001-01-14 23:47:14 +0000107 # Set flag indicating start of a new member
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000108 self._new_member = True
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000109 # Buffer data read from gzip file. extrastart is offset in
110 # stream where buffer starts. extrasize is number of
111 # bytes remaining in buffer from current stream position.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000112 self.extrabuf = ""
113 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000114 self.extrastart = 0
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000115 self.name = filename
Bob Ippolitod82c3102006-05-22 15:59:12 +0000116 # Starts small, scales exponentially
117 self.min_readsize = 100
Guido van Rossum15262191997-04-30 16:04:57 +0000118
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000119 elif mode[0:1] == 'w' or mode[0:1] == 'a':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000120 self.mode = WRITE
121 self._init_write(filename)
122 self.compress = zlib.compressobj(compresslevel,
Tim Peters07e99cb2001-01-14 23:47:14 +0000123 zlib.DEFLATED,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000124 -zlib.MAX_WBITS,
125 zlib.DEF_MEM_LEVEL,
126 0)
127 else:
Martin v. Löwisdb044892002-03-11 06:46:52 +0000128 raise IOError, "Mode " + mode + " not supported"
Guido van Rossum15262191997-04-30 16:04:57 +0000129
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000130 self.fileobj = fileobj
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000131 self.offset = 0
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000132 self.mtime = mtime
Guido van Rossum15262191997-04-30 16:04:57 +0000133
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000134 if self.mode == WRITE:
135 self._write_gzip_header()
Guido van Rossum15262191997-04-30 16:04:57 +0000136
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000137 @property
138 def filename(self):
139 import warnings
Philip Jenveyd846f1d2009-05-08 02:28:39 +0000140 warnings.warn("use the name attribute", DeprecationWarning, 2)
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000141 if self.mode == WRITE and self.name[-3:] != ".gz":
142 return self.name + ".gz"
143 return self.name
144
Guido van Rossum15262191997-04-30 16:04:57 +0000145 def __repr__(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000146 s = repr(self.fileobj)
147 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
Guido van Rossum15262191997-04-30 16:04:57 +0000148
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000149 def _check_closed(self):
150 """Raises a ValueError if the underlying file object has been closed.
151
152 """
153 if self.closed:
154 raise ValueError('I/O operation on closed file.')
155
Guido van Rossum15262191997-04-30 16:04:57 +0000156 def _init_write(self, filename):
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000157 self.name = filename
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000158 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000159 self.size = 0
160 self.writebuf = []
161 self.bufsize = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000162
163 def _write_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000164 self.fileobj.write('\037\213') # magic header
165 self.fileobj.write('\010') # compression method
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +0000166 fname = os.path.basename(self.name)
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000167 if fname.endswith(".gz"):
168 fname = fname[:-3]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000169 flags = 0
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000170 if fname:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000171 flags = FNAME
172 self.fileobj.write(chr(flags))
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000173 mtime = self.mtime
174 if mtime is None:
175 mtime = time.time()
176 write32u(self.fileobj, long(mtime))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000177 self.fileobj.write('\002')
178 self.fileobj.write('\377')
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000179 if fname:
180 self.fileobj.write(fname + '\000')
Guido van Rossum15262191997-04-30 16:04:57 +0000181
182 def _init_read(self):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000183 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000184 self.size = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000185
186 def _read_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000187 magic = self.fileobj.read(2)
188 if magic != '\037\213':
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000189 raise IOError, 'Not a gzipped file'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000190 method = ord( self.fileobj.read(1) )
191 if method != 8:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000192 raise IOError, 'Unknown compression method'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000193 flag = ord( self.fileobj.read(1) )
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000194 self.mtime = read32(self.fileobj)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000195 # extraflag = self.fileobj.read(1)
196 # os = self.fileobj.read(1)
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000197 self.fileobj.read(2)
Guido van Rossum15262191997-04-30 16:04:57 +0000198
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000199 if flag & FEXTRA:
200 # Read & discard the extra field, if present
Tim Petersfb0ea522002-11-04 19:50:11 +0000201 xlen = ord(self.fileobj.read(1))
202 xlen = xlen + 256*ord(self.fileobj.read(1))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000203 self.fileobj.read(xlen)
204 if flag & FNAME:
205 # Read and discard a null-terminated string containing the filename
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000206 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000207 s = self.fileobj.read(1)
208 if not s or s=='\000':
209 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000210 if flag & FCOMMENT:
211 # Read and discard a null-terminated string containing a comment
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000212 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000213 s = self.fileobj.read(1)
214 if not s or s=='\000':
215 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000216 if flag & FHCRC:
217 self.fileobj.read(2) # Read & discard the 16-bit header CRC
Guido van Rossum15262191997-04-30 16:04:57 +0000218
Guido van Rossum15262191997-04-30 16:04:57 +0000219 def write(self,data):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000220 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000221 if self.mode != WRITE:
222 import errno
223 raise IOError(errno.EBADF, "write() on read-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000224
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000225 if self.fileobj is None:
226 raise ValueError, "write() on closed GzipFile object"
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000227
228 # Convert data type if called by io.BufferedWriter.
229 if isinstance(data, memoryview):
230 data = data.tobytes()
231
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000232 if len(data) > 0:
233 self.size = self.size + len(data)
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000234 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000235 self.fileobj.write( self.compress.compress(data) )
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000236 self.offset += len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000237
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000238 return len(data)
239
Guido van Rossum56068012000-02-02 16:51:06 +0000240 def read(self, size=-1):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000241 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000242 if self.mode != READ:
243 import errno
Brett Cannonedfb3022003-12-04 19:28:06 +0000244 raise IOError(errno.EBADF, "read() on write-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000245
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000246 if self.extrasize <= 0 and self.fileobj is None:
247 return ''
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000248
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000249 readsize = 1024
Guido van Rossum56068012000-02-02 16:51:06 +0000250 if size < 0: # get the whole thing
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000251 try:
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000252 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000253 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000254 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000255 except EOFError:
256 size = self.extrasize
257 else: # just get some more of it
258 try:
259 while size > self.extrasize:
260 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000261 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000262 except EOFError:
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000263 if size > self.extrasize:
264 size = self.extrasize
Tim Peters07e99cb2001-01-14 23:47:14 +0000265
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000266 offset = self.offset - self.extrastart
267 chunk = self.extrabuf[offset: offset + size]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000268 self.extrasize = self.extrasize - size
Guido van Rossum15262191997-04-30 16:04:57 +0000269
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000270 self.offset += size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000271 return chunk
Guido van Rossum15262191997-04-30 16:04:57 +0000272
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000273 def _unread(self, buf):
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000274 self.extrasize = len(buf) + self.extrasize
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000275 self.offset -= len(buf)
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000276
277 def _read(self, size=1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000278 if self.fileobj is None:
279 raise EOFError, "Reached EOF"
Tim Peters07e99cb2001-01-14 23:47:14 +0000280
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000281 if self._new_member:
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000282 # If the _new_member flag is set, we have to
283 # jump to the next member, if there is one.
Tim Peters07e99cb2001-01-14 23:47:14 +0000284 #
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000285 # First, check if we're at the end of the file;
286 # if so, it's time to stop; no more members to read.
287 pos = self.fileobj.tell() # Save current position
288 self.fileobj.seek(0, 2) # Seek to end of file
289 if pos == self.fileobj.tell():
Andrew M. Kuchling2d813e51999-09-06 16:34:51 +0000290 raise EOFError, "Reached EOF"
Tim Peters07e99cb2001-01-14 23:47:14 +0000291 else:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000292 self.fileobj.seek( pos ) # Return to original position
Tim Peters07e99cb2001-01-14 23:47:14 +0000293
294 self._init_read()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000295 self._read_gzip_header()
296 self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000297 self._new_member = False
Tim Peters07e99cb2001-01-14 23:47:14 +0000298
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000299 # Read a chunk of data from the file
300 buf = self.fileobj.read(size)
Tim Peters07e99cb2001-01-14 23:47:14 +0000301
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000302 # If the EOF has been reached, flush the decompression object
303 # and mark this object as finished.
Tim Peters07e99cb2001-01-14 23:47:14 +0000304
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000305 if buf == "":
306 uncompress = self.decompress.flush()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000307 self._read_eof()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000308 self._add_read_data( uncompress )
309 raise EOFError, 'Reached EOF'
Tim Peters07e99cb2001-01-14 23:47:14 +0000310
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000311 uncompress = self.decompress.decompress(buf)
312 self._add_read_data( uncompress )
313
314 if self.decompress.unused_data != "":
315 # Ending case: we've come to the end of a member in the file,
316 # so seek back to the start of the unused data, finish up
317 # this member, and read a new gzip header.
318 # (The number of bytes to seek back is the length of the unused
319 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
320 self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
321
322 # Check the CRC and file size, and set the flag so we read
Tim Peters07e99cb2001-01-14 23:47:14 +0000323 # a new member on the next call
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000324 self._read_eof()
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000325 self._new_member = True
Tim Peters07e99cb2001-01-14 23:47:14 +0000326
327 def _add_read_data(self, data):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000328 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000329 offset = self.offset - self.extrastart
330 self.extrabuf = self.extrabuf[offset:] + data
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000331 self.extrasize = self.extrasize + len(data)
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000332 self.extrastart = self.offset
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000333 self.size = self.size + len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000334
335 def _read_eof(self):
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000336 # We've read to the end of the file, so we have to rewind in order
Tim Peters07e99cb2001-01-14 23:47:14 +0000337 # to reread the 8 bytes containing the CRC and the file size.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000338 # We check the that the computed CRC and size of the
Tim Peters9288f952002-11-05 20:38:55 +0000339 # uncompressed data matches the stored values. Note that the size
340 # stored is the true file size mod 2**32.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000341 self.fileobj.seek(-8, 1)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000342 crc32 = read32(self.fileobj)
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000343 isize = read32(self.fileobj) # may exceed 2GB
344 if crc32 != self.crc:
345 raise IOError("CRC check failed %s != %s" % (hex(crc32),
346 hex(self.crc)))
Gregory P. Smithac830e92008-03-23 23:43:02 +0000347 elif isize != (self.size & 0xffffffffL):
Andrew M. Kuchling64edd6a2003-02-05 21:35:07 +0000348 raise IOError, "Incorrect length of data produced"
Tim Peters07e99cb2001-01-14 23:47:14 +0000349
Antoine Pitrou5a9112c2010-01-13 14:32:10 +0000350 # Gzip files can be padded with zeroes and still have archives.
351 # Consume all zero bytes and set the file position to the first
352 # non-zero byte. See http://www.gzip.org/#faq8
353 c = "\x00"
354 while c == "\x00":
355 c = self.fileobj.read(1)
356 if c:
357 self.fileobj.seek(-1, 1)
358
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000359 @property
360 def closed(self):
361 return self.fileobj is None
362
Guido van Rossum15262191997-04-30 16:04:57 +0000363 def close(self):
Georg Brandle08e3d02008-05-25 08:07:37 +0000364 if self.fileobj is None:
365 return
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000366 if self.mode == WRITE:
367 self.fileobj.write(self.compress.flush())
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000368 write32u(self.fileobj, self.crc)
Tim Peters9288f952002-11-05 20:38:55 +0000369 # self.size may exceed 2GB, or even 4GB
Gregory P. Smithdd102842008-03-23 23:45:12 +0000370 write32u(self.fileobj, self.size & 0xffffffffL)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000371 self.fileobj = None
372 elif self.mode == READ:
373 self.fileobj = None
374 if self.myfileobj:
375 self.myfileobj.close()
376 self.myfileobj = None
Guido van Rossum15262191997-04-30 16:04:57 +0000377
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000378 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000379 self._check_closed()
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000380 if self.mode == WRITE:
Tim Peterseba28be2005-03-28 01:08:02 +0000381 # Ensure the compressor's buffer is flushed
382 self.fileobj.write(self.compress.flush(zlib_mode))
Mark Dickinson3b0b4ff2010-05-04 18:45:27 +0000383 self.fileobj.flush()
Guido van Rossum15262191997-04-30 16:04:57 +0000384
Tim Peters5cfb05e2004-07-27 21:02:02 +0000385 def fileno(self):
386 """Invoke the underlying file object's fileno() method.
387
388 This will raise AttributeError if the underlying file object
389 doesn't support fileno().
390 """
391 return self.fileobj.fileno()
392
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000393 def rewind(self):
394 '''Return the uncompressed stream file position indicator to the
Tim Petersab9ba272001-08-09 21:40:30 +0000395 beginning of the file'''
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000396 if self.mode != READ:
397 raise IOError("Can't rewind in write mode")
398 self.fileobj.seek(0)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000399 self._new_member = True
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000400 self.extrabuf = ""
401 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000402 self.extrastart = 0
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000403 self.offset = 0
404
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000405 def readable(self):
406 return self.mode == READ
407
408 def writable(self):
409 return self.mode == WRITE
410
411 def seekable(self):
412 return True
413
Martin v. Löwis065f0c82006-11-12 10:41:39 +0000414 def seek(self, offset, whence=0):
415 if whence:
416 if whence == 1:
417 offset = self.offset + offset
418 else:
419 raise ValueError('Seek from end not supported')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000420 if self.mode == WRITE:
421 if offset < self.offset:
422 raise IOError('Negative seek in write mode')
423 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000424 for i in range(count // 1024):
425 self.write(1024 * '\0')
426 self.write((count % 1024) * '\0')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000427 elif self.mode == READ:
428 if offset < self.offset:
429 # for negative seek, rewind and do positive seek
430 self.rewind()
431 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000432 for i in range(count // 1024):
433 self.read(1024)
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000434 self.read(count % 1024)
435
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000436 return self.offset
437
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000438 def readline(self, size=-1):
Bob Ippolitod82c3102006-05-22 15:59:12 +0000439 if size < 0:
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000440 # Shortcut common case - newline found in buffer.
441 offset = self.offset - self.extrastart
442 i = self.extrabuf.find('\n', offset) + 1
443 if i > 0:
444 self.extrasize -= i - offset
445 self.offset += i - offset
446 return self.extrabuf[offset: i]
447
Bob Ippolitod82c3102006-05-22 15:59:12 +0000448 size = sys.maxint
449 readsize = self.min_readsize
450 else:
451 readsize = size
Bob Ippolitob9759732006-05-22 15:22:46 +0000452 bufs = []
Bob Ippolitod82c3102006-05-22 15:59:12 +0000453 while size != 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000454 c = self.read(readsize)
Eric S. Raymondee5e61d2001-02-09 09:10:35 +0000455 i = c.find('\n')
Bob Ippolitod82c3102006-05-22 15:59:12 +0000456
457 # We set i=size to break out of the loop under two
458 # conditions: 1) there's no newline, and the chunk is
459 # larger than size, or 2) there is a newline, but the
460 # resulting line would be longer than 'size'.
461 if (size <= i) or (i == -1 and len(c) > size):
462 i = size - 1
Guido van Rossum15262191997-04-30 16:04:57 +0000463
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000464 if i >= 0 or c == '':
Bob Ippolitod82c3102006-05-22 15:59:12 +0000465 bufs.append(c[:i + 1]) # Add portion of last chunk
466 self._unread(c[i + 1:]) # Push back rest of chunk
467 break
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000468
Bob Ippolitob9759732006-05-22 15:22:46 +0000469 # Append chunk to list, decrease 'size',
470 bufs.append(c)
471 size = size - len(c)
472 readsize = min(size, readsize * 2)
Bob Ippolitod82c3102006-05-22 15:59:12 +0000473 if readsize > self.min_readsize:
474 self.min_readsize = min(readsize, self.min_readsize * 2, 512)
475 return ''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000476
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000477
478def _test():
479 # Act like gzip; with -d, act like gunzip.
480 # The input file is not deleted, however, nor are any other gzip
481 # options or features supported.
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000482 args = sys.argv[1:]
483 decompress = args and args[0] == "-d"
484 if decompress:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000485 args = args[1:]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000486 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000487 args = ["-"]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000488 for arg in args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000489 if decompress:
490 if arg == "-":
491 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
492 g = sys.stdout
493 else:
494 if arg[-3:] != ".gz":
Walter Dörwald70a6b492004-02-12 17:35:32 +0000495 print "filename doesn't end in .gz:", repr(arg)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000496 continue
497 f = open(arg, "rb")
498 g = __builtin__.open(arg[:-3], "wb")
499 else:
500 if arg == "-":
501 f = sys.stdin
502 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
503 else:
504 f = __builtin__.open(arg, "rb")
505 g = open(arg + ".gz", "wb")
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000506 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000507 chunk = f.read(1024)
508 if not chunk:
509 break
510 g.write(chunk)
511 if g is not sys.stdout:
512 g.close()
513 if f is not sys.stdin:
514 f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000515
516if __name__ == '__main__':
517 _test()