blob: 5fc7639f56fbdf82491625ae12d96548e965f738 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel8c06ccc2009-10-29 09:15:00 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Antoine Pitrou673ddf92010-01-03 22:29:56 +000010import io
Guido van Rossum68de3791997-07-19 20:22:23 +000011import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +000012
Skip Montanaro2dd42762001-01-23 15:35:05 +000013__all__ = ["GzipFile","open"]
14
# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header and GzipFile._write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    *output* only needs a write() method; *value* is packed with the
    struct format "<L" (little-endian, unsigned 32-bit).
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from *input* and return them as an unsigned int.

    The bytes are decoded with the struct format "<I" (little-endian,
    unsigned 32-bit); struct.unpack raises struct.error if fewer than
    four bytes are available.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return a GzipFile object.

    Equivalent to GzipFile(filename, mode, compresslevel).  Only the
    filename is mandatory: mode falls back to 'rb' and compresslevel
    to 9 when they are left out.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (None when the caller
    # supplied fileobj); close() is responsible for closing it.
    myfileobj = None
    # Upper bound for the chunk size that read() asks _read() to fetch.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: produce a raw deflate stream; the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened a few lines above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated accessor; emits a DeprecationWarning on every read.

        Returns name, with ".gz" appended in write mode when name does not
        already end with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        """Return '<gzip ... 0x...>' built from the repr of the wrapped file."""
        s = repr(self.fileobj)
        # s[1:-1] drops the first and last character of the inner repr
        # (the angle brackets of a typical object repr).
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object.

        The original file name (basename of name, without a trailing
        ".gz") is stored, NUL-terminated, when it is non-empty.
        """
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        # Extra-flags byte and OS byte (the two bytes _read_gzip_header
        # skips); RFC 1952: XFL 2 = maximum compression, OS 255 = unknown.
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume a gzip member header from the underlying file object.

        Stores the header timestamp in mtime and skips every optional
        field.  Raises IOError if the magic number or the compression
        method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress *data*, write it out and return len(data).

        Raises IOError(EBADF) on a read-only object and ValueError on a
        closed one.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes; everything if size < 0.

        Returns '' once the buffer is empty and the object is closed.
        Raises IOError(EBADF) on a write-only object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # The request size doubles on every iteration, capped at
        # max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        # Slice the answer out of the buffer; the buffer itself is trimmed
        # lazily by _add_read_data().
        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push len(buf) bytes back by moving the stream position backwards.

        Only the counters change; the bytes are expected to still be in
        extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and buffer the decompressed result.

        Raises EOFError when there is nothing left to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that lies before the current stream
        # position, then make the buffer start at that position.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match what was
        computed while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed; a caller-supplied fileobj is left open.  Calling close()
        again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Detach first so the object counts as closed even if writing the
        # trailer fails below.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always close the file we opened ourselves, even when the
            # trailer could not be written.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the compressor (write mode) and the underlying file object.

        Raises ValueError if the object has been closed.
        """
        if self.fileobj is None:
            raise ValueError("flush() on closed GzipFile object")
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the object was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the object was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always True; see seek() for what is actually supported."""
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream; return the new offset.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; a backward seek raises IOError.  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read one line, or at most *size* bytes of it when size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger initial read size for later calls, capped at 512.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)    # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000458
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000459
460def _test():
461 # Act like gzip; with -d, act like gunzip.
462 # The input file is not deleted, however, nor are any other gzip
463 # options or features supported.
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000464 args = sys.argv[1:]
465 decompress = args and args[0] == "-d"
466 if decompress:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000467 args = args[1:]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000468 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000469 args = ["-"]
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000470 for arg in args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000471 if decompress:
472 if arg == "-":
473 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
474 g = sys.stdout
475 else:
476 if arg[-3:] != ".gz":
Walter Dörwald70a6b492004-02-12 17:35:32 +0000477 print "filename doesn't end in .gz:", repr(arg)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000478 continue
479 f = open(arg, "rb")
480 g = __builtin__.open(arg[:-3], "wb")
481 else:
482 if arg == "-":
483 f = sys.stdin
484 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
485 else:
486 f = __builtin__.open(arg, "rb")
487 g = open(arg + ".gz", "wb")
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000488 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000489 chunk = f.read(1024)
490 if not chunk:
491 break
492 g.write(chunk)
493 if g is not sys.stdout:
494 g.close()
495 if f is not sys.stdin:
496 f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000497
498if __name__ == '__main__':
499 _test()