blob: e6b81935051efee166e5b73919070cb524981654 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Public names exported by a star-import of this module.
__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header / _write_gzip_header).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method accepting bytes.  Nothing is
    returned.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read 4 bytes from *input* and return them as a little-endian
    unsigned integer.

    *input* only needs a read() method returning bytes.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for *filename*.

    Convenience wrapper: the three arguments are handed, unchanged and in
    order, to GzipFile(filename, mode, compresslevel).  filename must be
    supplied; mode defaults to 'rb' and compresslevel defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality.

    State:
      _buffer -- bytes served before falling through to the wrapped file
      _length -- len(_buffer)
      _read   -- index of the next unread byte in _buffer, or None once the
                 buffer has been exhausted and reads go straight to the file
    """

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, taking them from the buffer first."""
        if self._read is None:
            # Buffer already exhausted: plain pass-through.
            return self.file.read(size)
        if self._read + size <= self._length:
            # Request fits entirely inside the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain the rest of the buffer and top up from the file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Push *prepend* back so that it is returned by later reads.

        With readprevious=True the caller asserts that *prepend* is the tail
        of what was just read; if that many bytes were consumed from the
        buffer, the read index is simply rewound.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the still-unread part of the buffer.  (The original code
            # sliced with an undefined local ``read`` here, which raised
            # NameError; the read index lives in self._read.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the bytes still waiting in the buffer (b'' if none)."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek the padded stream; any buffered bytes are dropped unless a
        relative seek stays inside the buffer."""
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # The wrapped file's cursor sits at the END of the buffer,
                # i.e. (length - read) bytes ahead of our logical position,
                # so that distance must be subtracted from the relative
                # offset handed to the file.
                offset -= self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Anything not defined here is delegated to the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +000090
91
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound for the size of a single raw read issued by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.
        A fileobj.name that is neither str nor bytes (for instance an
        integer file descriptor) is treated as not discernible.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            # File objects built from a descriptor expose an int here; such a
            # value cannot be written into the header, so drop it.
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file this constructor opened itself, and leave
            # the half-built object in a "closed" state so that a later
            # close() (e.g. from the finalizer) is a harmless no-op.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            self.fileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode '.gz' is appended unless the name already ends with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        # Show the real underlying file rather than the internal wrapper.
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            # A bytes name is written as-is; only str needs encoding.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Parse one member header from self.fileobj and store its mtime.

        Raises EOFError if no bytes are left and IOError if the magic number
        or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress is whatever decompressor was set
        # before this header was read -- _read() only installs a fresh one
        # after this method returns.  Confirm that feeding the leftover
        # buffer to it here is intended.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress *data*, write it out and return len(data).

        Raises ValueError on a closed file and IOError (EBADF) if the object
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes; everything if size < 0.

        Raises ValueError on a closed file and IOError (EBADF) if the object
        was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Like read(), but issues at most one _read() on the raw stream."""
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        Requests below 100 bytes are raised to 100.  Raises IOError (EBADF)
        if the object was opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Give back len(buf) bytes by moving the read position backwards."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* raw bytes and append their decompressed form to the
        internal buffer.  Raises EOFError once the input is exhausted."""
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Fold *data* into the running CRC/size and append it to the
        buffer, discarding the part of the buffer already consumed."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer; raise IOError on a mismatch."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream (writing the trailer in write mode) and close
        the file opened by the constructor, if any.  Calling close() again
        does nothing."""
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed first so a failure below cannot leave it
        # half-open, and so a second close() is a no-op.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release a file we opened ourselves, even if writing
            # the trailer failed.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with *zlib_mode* and then
        flush the underlying file.  Raises ValueError on a closed file."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream and return the new
        position.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible (the gap
        is filled with zero bytes); a backward seek raises IOError.  In read
        mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Return one line (including the trailing newline, if any); with a
        non-negative *size*, return at most that many bytes."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000577
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000578
def compress(data, compresslevel=9):
    """Return *data* compressed into a complete gzip stream, in one call.

    compresslevel is optional and selects the compression level (1-9).
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    # Leaving the with-block closes the GzipFile, which finishes the stream.
    with writer:
        writer.write(data)
    return sink.getvalue()
587
def decompress(data):
    """Return the uncompressed contents of the gzip stream *data*, in one
    call.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
594
595
def _test():
    """Minimal command-line driver: act like gzip; with -d, act like gunzip.

    Each remaining argument is a file name; "-" (also the default when no
    name is given) means standard input/output.  The input file is not
    deleted, nor are any other gzip options or features supported.
    """
    args = sys.argv[1:]
    # Renamed from ``decompress`` so the module-level function of that name
    # is not shadowed inside this function.
    decompress_mode = args and args[0] == "-d"
    if decompress_mode:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress_mode:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The streams used above are the binary ``.buffer`` objects, so the
        # identity checks must compare against those; comparing against
        # sys.stdout / sys.stdin was always true and closed the process's
        # standard streams.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000633
# Run the command-line driver only when executed as a script, not on import.
if __name__ == '__main__':
    _test()