blob: 2bcb4dbfb0b386e85c1f0705ffa8c0c4a2328d83 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
Skip Montanaro2dd42762001-01-23 15:35:05 +000013__all__ = ["GzipFile","open"]
14
# Bit masks tested against / written to the flag byte of a gzip member
# header (see GzipFile._read_gzip_header and GzipFile._write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as a 32-bit little-endian unsigned integer.

    output -- object with a write() method accepting a byte string.
    value  -- integer; only its low 32 bits are written, so a negative
              (signed) value is emitted as its two's-complement bit pattern.
    """
    # struct's "L" format rejects negative and oversized integers, so
    # reduce the value to its low 32 bits before packing.
    output.write(struct.pack("<L", value & 0xffffffff))
23
def read32(input):
    """Read four bytes from *input* and decode them as a little-endian
    unsigned 32-bit integer.

    struct.unpack raises struct.error if fewer than four bytes are read.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper returning GzipFile(filename, mode, compresslevel).

    filename must be supplied.  mode is 'rb' unless given, and
    compresslevel is 9 unless given.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
35
Antoine Pitrou673ddf92010-01-03 22:29:56 +000036class GzipFile(io.BufferedIOBase):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000037 """The GzipFile class simulates most of the methods of a file object with
Guido van Rossum97c5fcc2002-08-06 17:03:25 +000038 the exception of the readinto() and truncate() methods.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000039
40 """
Guido van Rossum15262191997-04-30 16:04:57 +000041
Guido van Rossum68de3791997-07-19 20:22:23 +000042 myfileobj = None
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +000043 max_read_chunk = 10 * 1024 * 1024 # 10Mb
Guido van Rossum68de3791997-07-19 20:22:23 +000044
Tim Peters07e99cb2001-01-14 23:47:14 +000045 def __init__(self, filename=None, mode=None,
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000046 compresslevel=9, fileobj=None, mtime=None):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000047 """Constructor for the GzipFile class.
48
49 At least one of fileobj and filename must be given a
50 non-trivial value.
51
52 The new class instance is based on fileobj, which can be a regular
53 file, a StringIO object, or any other object which simulates a file.
54 It defaults to None, in which case filename is opened to provide
55 a file object.
56
57 When fileobj is not None, the filename argument is only used to be
58 included in the gzip file header, which may includes the original
59 filename of the uncompressed file. It defaults to the filename of
60 fileobj, if discernible; otherwise, it defaults to the empty string,
61 and in this case the original filename is not included in the header.
62
63 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
64 depending on whether the file will be read or written. The default
65 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
66 Be aware that only the 'rb', 'ab', and 'wb' values should be used
67 for cross-platform portability.
68
69 The compresslevel argument is an integer from 1 to 9 controlling the
70 level of compression; 1 is fastest and produces the least compression,
71 and 9 is slowest and produces the most compression. The default is 9.
72
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +000073 The mtime argument is an optional numeric timestamp to be written
74 to the stream when compressing. All gzip compressed streams
75 are required to contain a timestamp. If omitted or None, the
76 current time is used. This module ignores the timestamp when
77 decompressing; however, some programs, such as gunzip, make use
78 of it. The format of the timestamp is the same as that of the
79 return value of time.time() and of the st_mtime member of the
80 object returned by os.stat().
81
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000082 """
83
Skip Montanaro12424bc2002-05-23 01:43:05 +000084 # guarantee the file is opened in binary mode on platforms
85 # that care about that sort of thing
86 if mode and 'b' not in mode:
87 mode += 'b'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000088 if fileobj is None:
Fred Drake9bb76d11999-04-05 18:33:40 +000089 fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
Guido van Rossum68de3791997-07-19 20:22:23 +000090 if filename is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000091 if hasattr(fileobj, 'name'): filename = fileobj.name
92 else: filename = ''
Guido van Rossum68de3791997-07-19 20:22:23 +000093 if mode is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000094 if hasattr(fileobj, 'mode'): mode = fileobj.mode
Fred Drake9bb76d11999-04-05 18:33:40 +000095 else: mode = 'rb'
Guido van Rossum68de3791997-07-19 20:22:23 +000096
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000097 if mode[0:1] == 'r':
98 self.mode = READ
Tim Peters07e99cb2001-01-14 23:47:14 +000099 # Set flag indicating start of a new member
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000100 self._new_member = True
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000101 # Buffer data read from gzip file. extrastart is offset in
102 # stream where buffer starts. extrasize is number of
103 # bytes remaining in buffer from current stream position.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000104 self.extrabuf = ""
105 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000106 self.extrastart = 0
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000107 self.name = filename
Bob Ippolitod82c3102006-05-22 15:59:12 +0000108 # Starts small, scales exponentially
109 self.min_readsize = 100
Guido van Rossum15262191997-04-30 16:04:57 +0000110
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000111 elif mode[0:1] == 'w' or mode[0:1] == 'a':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000112 self.mode = WRITE
113 self._init_write(filename)
114 self.compress = zlib.compressobj(compresslevel,
Tim Peters07e99cb2001-01-14 23:47:14 +0000115 zlib.DEFLATED,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000116 -zlib.MAX_WBITS,
117 zlib.DEF_MEM_LEVEL,
118 0)
119 else:
Martin v. Löwisdb044892002-03-11 06:46:52 +0000120 raise IOError, "Mode " + mode + " not supported"
Guido van Rossum15262191997-04-30 16:04:57 +0000121
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000122 self.fileobj = fileobj
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000123 self.offset = 0
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000124 self.mtime = mtime
Guido van Rossum15262191997-04-30 16:04:57 +0000125
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000126 if self.mode == WRITE:
127 self._write_gzip_header()
Guido van Rossum15262191997-04-30 16:04:57 +0000128
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000129 @property
130 def filename(self):
131 import warnings
Philip Jenveyd846f1d2009-05-08 02:28:39 +0000132 warnings.warn("use the name attribute", DeprecationWarning, 2)
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000133 if self.mode == WRITE and self.name[-3:] != ".gz":
134 return self.name + ".gz"
135 return self.name
136
Guido van Rossum15262191997-04-30 16:04:57 +0000137 def __repr__(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000138 s = repr(self.fileobj)
139 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
Guido van Rossum15262191997-04-30 16:04:57 +0000140
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000141 def _check_closed(self):
142 """Raises a ValueError if the underlying file object has been closed.
143
144 """
145 if self.closed:
146 raise ValueError('I/O operation on closed file.')
147
Guido van Rossum15262191997-04-30 16:04:57 +0000148 def _init_write(self, filename):
Lars Gustäbel5b1a7852007-02-13 16:09:24 +0000149 self.name = filename
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000150 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000151 self.size = 0
152 self.writebuf = []
153 self.bufsize = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000154
155 def _write_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000156 self.fileobj.write('\037\213') # magic header
157 self.fileobj.write('\010') # compression method
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +0000158 fname = os.path.basename(self.name)
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000159 if fname.endswith(".gz"):
160 fname = fname[:-3]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000161 flags = 0
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000162 if fname:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000163 flags = FNAME
164 self.fileobj.write(chr(flags))
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000165 mtime = self.mtime
166 if mtime is None:
167 mtime = time.time()
168 write32u(self.fileobj, long(mtime))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000169 self.fileobj.write('\002')
170 self.fileobj.write('\377')
Lars Gustäbelf19c1b52007-02-13 16:24:00 +0000171 if fname:
172 self.fileobj.write(fname + '\000')
Guido van Rossum15262191997-04-30 16:04:57 +0000173
174 def _init_read(self):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000175 self.crc = zlib.crc32("") & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000176 self.size = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000177
178 def _read_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000179 magic = self.fileobj.read(2)
180 if magic != '\037\213':
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000181 raise IOError, 'Not a gzipped file'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000182 method = ord( self.fileobj.read(1) )
183 if method != 8:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000184 raise IOError, 'Unknown compression method'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000185 flag = ord( self.fileobj.read(1) )
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000186 self.mtime = read32(self.fileobj)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000187 # extraflag = self.fileobj.read(1)
188 # os = self.fileobj.read(1)
Antoine Pitrouf0d2c3f2009-01-04 21:29:23 +0000189 self.fileobj.read(2)
Guido van Rossum15262191997-04-30 16:04:57 +0000190
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000191 if flag & FEXTRA:
192 # Read & discard the extra field, if present
Tim Petersfb0ea522002-11-04 19:50:11 +0000193 xlen = ord(self.fileobj.read(1))
194 xlen = xlen + 256*ord(self.fileobj.read(1))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000195 self.fileobj.read(xlen)
196 if flag & FNAME:
197 # Read and discard a null-terminated string containing the filename
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000198 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000199 s = self.fileobj.read(1)
200 if not s or s=='\000':
201 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000202 if flag & FCOMMENT:
203 # Read and discard a null-terminated string containing a comment
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000204 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000205 s = self.fileobj.read(1)
206 if not s or s=='\000':
207 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000208 if flag & FHCRC:
209 self.fileobj.read(2) # Read & discard the 16-bit header CRC
Guido van Rossum15262191997-04-30 16:04:57 +0000210
Guido van Rossum15262191997-04-30 16:04:57 +0000211 def write(self,data):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000212 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000213 if self.mode != WRITE:
214 import errno
215 raise IOError(errno.EBADF, "write() on read-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000216
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000217 if self.fileobj is None:
218 raise ValueError, "write() on closed GzipFile object"
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000219
220 # Convert data type if called by io.BufferedWriter.
221 if isinstance(data, memoryview):
222 data = data.tobytes()
223
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000224 if len(data) > 0:
225 self.size = self.size + len(data)
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000226 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000227 self.fileobj.write( self.compress.compress(data) )
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000228 self.offset += len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000229
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000230 return len(data)
231
Guido van Rossum56068012000-02-02 16:51:06 +0000232 def read(self, size=-1):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000233 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000234 if self.mode != READ:
235 import errno
Brett Cannonedfb3022003-12-04 19:28:06 +0000236 raise IOError(errno.EBADF, "read() on write-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000237
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000238 if self.extrasize <= 0 and self.fileobj is None:
239 return ''
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000240
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000241 readsize = 1024
Guido van Rossum56068012000-02-02 16:51:06 +0000242 if size < 0: # get the whole thing
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000243 try:
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000244 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000245 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000246 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000247 except EOFError:
248 size = self.extrasize
249 else: # just get some more of it
250 try:
251 while size > self.extrasize:
252 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000253 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000254 except EOFError:
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000255 if size > self.extrasize:
256 size = self.extrasize
Tim Peters07e99cb2001-01-14 23:47:14 +0000257
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000258 offset = self.offset - self.extrastart
259 chunk = self.extrabuf[offset: offset + size]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000260 self.extrasize = self.extrasize - size
Guido van Rossum15262191997-04-30 16:04:57 +0000261
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000262 self.offset += size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000263 return chunk
Guido van Rossum15262191997-04-30 16:04:57 +0000264
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000265 def _unread(self, buf):
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000266 self.extrasize = len(buf) + self.extrasize
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000267 self.offset -= len(buf)
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000268
269 def _read(self, size=1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000270 if self.fileobj is None:
271 raise EOFError, "Reached EOF"
Tim Peters07e99cb2001-01-14 23:47:14 +0000272
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000273 if self._new_member:
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000274 # If the _new_member flag is set, we have to
275 # jump to the next member, if there is one.
Tim Peters07e99cb2001-01-14 23:47:14 +0000276 #
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000277 # First, check if we're at the end of the file;
278 # if so, it's time to stop; no more members to read.
279 pos = self.fileobj.tell() # Save current position
280 self.fileobj.seek(0, 2) # Seek to end of file
281 if pos == self.fileobj.tell():
Andrew M. Kuchling2d813e51999-09-06 16:34:51 +0000282 raise EOFError, "Reached EOF"
Tim Peters07e99cb2001-01-14 23:47:14 +0000283 else:
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000284 self.fileobj.seek( pos ) # Return to original position
Tim Peters07e99cb2001-01-14 23:47:14 +0000285
286 self._init_read()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000287 self._read_gzip_header()
288 self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000289 self._new_member = False
Tim Peters07e99cb2001-01-14 23:47:14 +0000290
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000291 # Read a chunk of data from the file
292 buf = self.fileobj.read(size)
Tim Peters07e99cb2001-01-14 23:47:14 +0000293
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000294 # If the EOF has been reached, flush the decompression object
295 # and mark this object as finished.
Tim Peters07e99cb2001-01-14 23:47:14 +0000296
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000297 if buf == "":
298 uncompress = self.decompress.flush()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000299 self._read_eof()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000300 self._add_read_data( uncompress )
301 raise EOFError, 'Reached EOF'
Tim Peters07e99cb2001-01-14 23:47:14 +0000302
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000303 uncompress = self.decompress.decompress(buf)
304 self._add_read_data( uncompress )
305
306 if self.decompress.unused_data != "":
307 # Ending case: we've come to the end of a member in the file,
308 # so seek back to the start of the unused data, finish up
309 # this member, and read a new gzip header.
310 # (The number of bytes to seek back is the length of the unused
311 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
312 self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
313
314 # Check the CRC and file size, and set the flag so we read
Tim Peters07e99cb2001-01-14 23:47:14 +0000315 # a new member on the next call
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000316 self._read_eof()
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000317 self._new_member = True
Tim Peters07e99cb2001-01-14 23:47:14 +0000318
319 def _add_read_data(self, data):
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000320 self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000321 offset = self.offset - self.extrastart
322 self.extrabuf = self.extrabuf[offset:] + data
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000323 self.extrasize = self.extrasize + len(data)
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000324 self.extrastart = self.offset
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000325 self.size = self.size + len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000326
327 def _read_eof(self):
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000328 # We've read to the end of the file, so we have to rewind in order
Tim Peters07e99cb2001-01-14 23:47:14 +0000329 # to reread the 8 bytes containing the CRC and the file size.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000330 # We check the that the computed CRC and size of the
Tim Peters9288f952002-11-05 20:38:55 +0000331 # uncompressed data matches the stored values. Note that the size
332 # stored is the true file size mod 2**32.
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000333 self.fileobj.seek(-8, 1)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000334 crc32 = read32(self.fileobj)
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000335 isize = read32(self.fileobj) # may exceed 2GB
336 if crc32 != self.crc:
337 raise IOError("CRC check failed %s != %s" % (hex(crc32),
338 hex(self.crc)))
Gregory P. Smithac830e92008-03-23 23:43:02 +0000339 elif isize != (self.size & 0xffffffffL):
Andrew M. Kuchling64edd6a2003-02-05 21:35:07 +0000340 raise IOError, "Incorrect length of data produced"
Tim Peters07e99cb2001-01-14 23:47:14 +0000341
Antoine Pitrou5a9112c2010-01-13 14:32:10 +0000342 # Gzip files can be padded with zeroes and still have archives.
343 # Consume all zero bytes and set the file position to the first
344 # non-zero byte. See http://www.gzip.org/#faq8
345 c = "\x00"
346 while c == "\x00":
347 c = self.fileobj.read(1)
348 if c:
349 self.fileobj.seek(-1, 1)
350
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000351 @property
352 def closed(self):
353 return self.fileobj is None
354
Guido van Rossum15262191997-04-30 16:04:57 +0000355 def close(self):
Georg Brandle08e3d02008-05-25 08:07:37 +0000356 if self.fileobj is None:
357 return
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000358 if self.mode == WRITE:
359 self.fileobj.write(self.compress.flush())
Gregory P. Smith79b4ba82008-03-23 21:04:43 +0000360 write32u(self.fileobj, self.crc)
Tim Peters9288f952002-11-05 20:38:55 +0000361 # self.size may exceed 2GB, or even 4GB
Gregory P. Smithdd102842008-03-23 23:45:12 +0000362 write32u(self.fileobj, self.size & 0xffffffffL)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000363 self.fileobj = None
364 elif self.mode == READ:
365 self.fileobj = None
366 if self.myfileobj:
367 self.myfileobj.close()
368 self.myfileobj = None
Guido van Rossum15262191997-04-30 16:04:57 +0000369
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000370 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
Antoine Pitrou76a66aa2010-10-06 21:26:52 +0000371 self._check_closed()
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000372 if self.mode == WRITE:
Tim Peterseba28be2005-03-28 01:08:02 +0000373 # Ensure the compressor's buffer is flushed
374 self.fileobj.write(self.compress.flush(zlib_mode))
Mark Dickinson3b0b4ff2010-05-04 18:45:27 +0000375 self.fileobj.flush()
Guido van Rossum15262191997-04-30 16:04:57 +0000376
Tim Peters5cfb05e2004-07-27 21:02:02 +0000377 def fileno(self):
378 """Invoke the underlying file object's fileno() method.
379
380 This will raise AttributeError if the underlying file object
381 doesn't support fileno().
382 """
383 return self.fileobj.fileno()
384
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000385 def rewind(self):
386 '''Return the uncompressed stream file position indicator to the
Tim Petersab9ba272001-08-09 21:40:30 +0000387 beginning of the file'''
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000388 if self.mode != READ:
389 raise IOError("Can't rewind in write mode")
390 self.fileobj.seek(0)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000391 self._new_member = True
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000392 self.extrabuf = ""
393 self.extrasize = 0
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000394 self.extrastart = 0
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000395 self.offset = 0
396
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000397 def readable(self):
398 return self.mode == READ
399
400 def writable(self):
401 return self.mode == WRITE
402
403 def seekable(self):
404 return True
405
Martin v. Löwis065f0c82006-11-12 10:41:39 +0000406 def seek(self, offset, whence=0):
407 if whence:
408 if whence == 1:
409 offset = self.offset + offset
410 else:
411 raise ValueError('Seek from end not supported')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000412 if self.mode == WRITE:
413 if offset < self.offset:
414 raise IOError('Negative seek in write mode')
415 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000416 for i in range(count // 1024):
417 self.write(1024 * '\0')
418 self.write((count % 1024) * '\0')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000419 elif self.mode == READ:
420 if offset < self.offset:
421 # for negative seek, rewind and do positive seek
422 self.rewind()
423 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000424 for i in range(count // 1024):
425 self.read(1024)
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000426 self.read(count % 1024)
427
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000428 return self.offset
429
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000430 def readline(self, size=-1):
Bob Ippolitod82c3102006-05-22 15:59:12 +0000431 if size < 0:
Antoine Pitrou673ddf92010-01-03 22:29:56 +0000432 # Shortcut common case - newline found in buffer.
433 offset = self.offset - self.extrastart
434 i = self.extrabuf.find('\n', offset) + 1
435 if i > 0:
436 self.extrasize -= i - offset
437 self.offset += i - offset
438 return self.extrabuf[offset: i]
439
Bob Ippolitod82c3102006-05-22 15:59:12 +0000440 size = sys.maxint
441 readsize = self.min_readsize
442 else:
443 readsize = size
Bob Ippolitob9759732006-05-22 15:22:46 +0000444 bufs = []
Bob Ippolitod82c3102006-05-22 15:59:12 +0000445 while size != 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000446 c = self.read(readsize)
Eric S. Raymondee5e61d2001-02-09 09:10:35 +0000447 i = c.find('\n')
Bob Ippolitod82c3102006-05-22 15:59:12 +0000448
449 # We set i=size to break out of the loop under two
450 # conditions: 1) there's no newline, and the chunk is
451 # larger than size, or 2) there is a newline, but the
452 # resulting line would be longer than 'size'.
453 if (size <= i) or (i == -1 and len(c) > size):
454 i = size - 1
Guido van Rossum15262191997-04-30 16:04:57 +0000455
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000456 if i >= 0 or c == '':
Bob Ippolitod82c3102006-05-22 15:59:12 +0000457 bufs.append(c[:i + 1]) # Add portion of last chunk
458 self._unread(c[i + 1:]) # Push back rest of chunk
459 break
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000460
Bob Ippolitob9759732006-05-22 15:22:46 +0000461 # Append chunk to list, decrease 'size',
462 bufs.append(c)
463 size = size - len(c)
464 readsize = min(size, readsize * 2)
Bob Ippolitod82c3102006-05-22 15:59:12 +0000465 if readsize > self.min_readsize:
466 self.min_readsize = min(readsize, self.min_readsize * 2, 512)
467 return ''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000468
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000469
def _test():
    """Command-line driver: compress, or with -d decompress, each argument.

    Arguments are taken from sys.argv; "-" (the default when no file is
    given) means stdin to stdout.  Compressing FILE writes FILE.gz;
    decompressing FILE.gz writes FILE.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write("filename doesn't end in .gz: " +
                                     repr(arg) + "\n")
                    continue
                # open here is this module's gzip-aware open().
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even if the copy fails part-way through; the
        # process-wide stdin/stdout objects themselves are never closed.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000507
# When executed as a script, run the gzip/gunzip-like driver in _test().
if __name__ == '__main__':
    _test()