# blob: 83311cc0deb800a76f44a82dd07c83a91fcba11b
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import errno
import io
import os
import struct
import sys
import time
import zlib

# Names exported by ``from gzip import *``.
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks for the FLG byte of a gzip member header (RFC 1952).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Direction in which a GzipFile instance operates.
READ, WRITE = 1, 2

def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is interpreted as a two's-complement 32-bit value and
    mapped to the corresponding non-negative integer; a non-negative i
    is returned unchanged.
    """
    if i < 0:
        i += 1 << 32
    return i

def LOWU32(i):
    """Return the low-order 32 bits of i, as a non-negative int."""
    return i & 0xFFFFFFFF

def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output is any object with a write() method accepting bytes.  value
    must already lie in the range 0 <= value < 2**32; struct.pack()
    raises struct.error otherwise, so callers mask the value first.
    """
    output.write(struct.pack("<L", value))

def read32(input):
    """Read 4 bytes from input and return them as an unsigned int.

    The bytes are interpreted as a little-endian 32-bit unsigned
    integer.  struct.error is raised when fewer than 4 bytes are
    available.
    """
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile(io.BufferedIOBase):
    """File-like object that transparently reads or writes gzip data.

    The GzipFile class simulates most of the methods of a file object
    with the exception of the readinto() and truncate() methods.  An
    instance works in exactly one direction (reading or writing), which
    is chosen by the mode given to the constructor.

    """

    # File object opened by the constructor itself (as opposed to one
    # supplied by the caller).  close() closes it.
    myfileobj = None
    # Upper bound on the size of a single read from the underlying file.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError when the mode is not a read, write or append mode.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            # Some file objects expose a non-string name (for example an
            # integer file descriptor); that cannot go into the header.
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened a few lines above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode a '.gz' suffix is appended when the name does not
        already end with it.  Accessing the property emits a
        DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            # A bytes name is written as given; only str needs encoding.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags
        self.fileobj.write(b'\377')                 # OS: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from the underlying file object.

        Raises IOError when the file ends first.  IOError rather than
        EOFError is used on purpose: read() treats EOFError as the
        regular end of data and would hide the truncation.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            more = self.fileobj.read(n - len(data))
            if not more:
                raise IOError('Truncated gzip header')
            data += more
        return data

    def _read_gzip_header(self):
        """Parse one gzip member header from the underlying file object.

        Stores the header timestamp in self.mtime and discards the
        optional fields.  Raises IOError when the magic number or the
        compression method is wrong, or when the header is truncated.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = self._read_exact(1)[0]
        if method != 8:
            raise IOError('Unknown compression method')
        flag = self._read_exact(1)[0]
        self.mtime = struct.unpack("<I", self._read_exact(4))[0]
        # Skip the extra-flags byte and the OS byte.
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen, = struct.unpack("<H", self._read_exact(2))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it out; return len(data).

        Raises IOError (EBADF) on an object opened for reading and
        ValueError on a closed object.
        """
        if self.mode != WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes.

        With size omitted, None or negative, everything up to the end
        of the file is returned.  Raises IOError (EBADF) on an object
        opened for writing.
        """
        if self.mode != READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        if size is None:
            size = -1

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back so that the next read returns it again.

        Only the counters move: the bytes are still in self.extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress about size compressed bytes into the buffer.

        Raises EOFError once no more data is available.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer, updating CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError when the stored CRC or length does not match the
        data that was decompressed.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the gzip stream (in write mode) and close the object.

        A file object supplied by the caller is left open; a file that
        the constructor opened itself is closed.  Calling close() more
        than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed first so that a failure while writing
        # the trailer cannot leave it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release the file we opened ourselves, even when
            # writing the trailer failed.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file object.

        Does nothing in read mode.  Raises ValueError on a closed
        object opened for writing.
        """
        if self.mode == WRITE:
            if self.fileobj is None:
                raise ValueError("flush() on closed GzipFile object")
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative to the current
        position); any other value raises ValueError.  In write mode
        only forward seeks are possible and the gap is filled with zero
        bytes.  In read mode a backward seek rewinds and re-reads from
        the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Return one line, at most size bytes long when size is >= 0.

        With size omitted, None or negative the line length is not
        limited.
        """
        if size is None:
            size = -1
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs)     # Return resulting line


def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.

    Optional argument is the compression level, in range of 1-9.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()

def decompress(data):
    """Decompress gzip compressed bytes in one shot.

    Return the decompressed bytes.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def _test():
    """Command-line driver: compress the named files, or with -d expand them.

    Acts like gzip; with -d, acts like gunzip.  The input file is not
    deleted, however, nor are any other gzip options or features
    supported.  A file name of "-" (the default when no names are
    given) means standard input and standard output.
    """
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    expand = args and args[0] == "-d"
    if expand:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if expand:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The standard streams are used through their binary buffers, so
        # those are the objects that must be left open.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()

# Run the command-line driver when executed as a script.
if __name__ == '__main__':
    _test()