blob: 83cc0773d779de099c559450d30bf10eabdf1a41 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
Skip Montanaro2dd42762001-01-23 15:35:05 +000013__all__ = ["GzipFile","open"]
14
# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header / _write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method accepting the packed bytes.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read 4 bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted in little-endian order.  struct.error is
    raised if read(4) returns fewer than 4 bytes.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
26
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file; shorthand for GzipFile(filename, mode, compresslevel).

    filename is mandatory.  When omitted, mode is 'rb' and compresslevel
    is 9.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
35
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (as opposed to one handed
    # in by the caller); close() closes it.  None when the caller owns the
    # underlying file.
    myfileobj = None
    # Upper bound on the amount of compressed data requested per _read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  0 is no
        compression at all.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        An IOError is raised if the mode does not start with 'r', 'w'
        or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leave behind a file that was opened above on the
            # caller's behalf when the mode turns out to be unusable.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated accessor; use the name attribute instead.

        Emits a DeprecationWarning.  In write mode the returned name has
        '.gz' appended when it does not already end with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        """Return '<gzip ... 0x...>' built around the repr of the wrapped file."""
        s = repr(self.fileobj)
        # s[1:-1] drops the first and last character of the inner repr
        # (the angle brackets for an ordinary file object).
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Record the stream name and reset the CRC/size state for writing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write a gzip member header to the underlying file object.

        The header consists of the magic number, the compression method,
        a flag byte, the modification time, two fixed bytes and, when one
        is available, the NUL-terminated original file name.
        """
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, str):
                fname = fname.encode('latin-1')
            if fname.endswith('.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = ''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags (RFC 1952 XFL)
        self.fileobj.write('\377')                 # OS byte (RFC 1952: 255 = unknown)
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header from the underlying file.

        Stores the header timestamp in self.mtime and skips every optional
        field announced by the flag byte.  Raises IOError if the magic
        number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data*, write it to the underlying file, return len(data).

        Raises ValueError if the object is closed and IOError (EBADF) if
        it was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            # Write first: the bookkeeping below is only updated once the
            # compressed bytes have been handed to the underlying file.
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* bytes of decompressed data.

        A negative *size* reads until the end of the stream.  Fewer bytes
        than requested (possibly none) are returned once the end of the
        stream is reached.  Raises ValueError if the object is closed and
        IOError (EBADF) if it was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # Amount of compressed data requested per _read(); doubled after
        # every call up to max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push back len(buf) bytes so the next read returns them again.

        Only the counters are adjusted; the bytes are expected to still be
        present in self.extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer the decompressed result.

        Raises EOFError when there is nothing left to read.  Header, CRC
        and length problems surface as IOError from the helpers called
        here.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the read buffer.

        Updates the running CRC and size, and drops the part of the buffer
        that lies before the current stream offset.
        """
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the trailer of the current member.

        Raises IOError if either value disagrees with what was computed
        while decompressing.  Afterwards any zero padding that follows the
        member is skipped.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file object supplied by the caller
        is left open; a file opened by the constructor is closed.  Calling
        close() more than once is harmless.

        The object is marked closed and an owned file is closed even when
        writing the trailer raises; the exception still propagates.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Detach first so a failure below cannot leave the object half open
        # (and so a later close(), e.g. from __del__, does not try again).
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file object.

        Does nothing in read mode.  Raises ValueError if the object is
        closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the object was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the object was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Return True; see seek() for the restrictions that apply."""
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream; return the new offset.

        *whence* may be 0 (absolute) or 1 (relative to the current
        position); any other value raises ValueError.  In write mode only
        forward seeks are possible and the gap is filled with zero bytes;
        a backward seek raises IOError.  In read mode a backward seek
        rewinds and reads forward again from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline if any.

        When *size* is non-negative at most *size* bytes are returned.  An
        empty string is returned at the end of the stream.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting read size for later calls, capped at 512.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)  # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000484
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000485
def _test():
    """Minimal command-line driver: compress, or decompress with -d.

    Each remaining argument names a file; "-" (also the default when no
    file is given) stands for stdin/stdout.  Input files are never
    deleted and no other gzip option is understood.
    """
    argv = sys.argv[1:]
    decompress = argv[:1] == ["-d"]
    if decompress:
        argv = argv[1:]
    argv = argv or ["-"]
    for arg in argv:
        use_std_streams = (arg == "-")
        if decompress and use_std_streams:
            source = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
            sink = sys.stdout
        elif decompress:
            if not arg.endswith(".gz"):
                print(" ".join(["filename doesn't end in .gz:", repr(arg)]))
                continue
            source = open(arg, "rb")
            sink = __builtin__.open(arg[:-3], "wb")
        elif use_std_streams:
            source = sys.stdin
            sink = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            source = __builtin__.open(arg, "rb")
            sink = open(arg + ".gz", "wb")

        # Copy in 1024-byte chunks until the source is exhausted.
        chunk = source.read(1024)
        while chunk:
            sink.write(chunk)
            chunk = source.read(1024)

        # The process-wide standard streams are left open.
        if sink is not sys.stdout:
            sink.close()
        if source is not sys.stdin:
            source.close()


if __name__ == '__main__':
    _test()