blob: 93dda4edd469228052cb6d42131efd1e902cd3b9 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
import builtins
import io
import os
import struct
import sys
import time
import zlib
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against / written to the flag byte of a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
18
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output is any object with a write() method accepting bytes.
    struct.error is raised if value does not fit the "<L" format.
    """
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))
23
def read32(input):
    """Read 4 bytes from input and return them as a little-endian unsigned int.

    input is any object with a read() method returning bytes.  struct.error
    is raised if fewer than 4 bytes are available.
    """
    return struct.unpack("<I", input.read(4))[0]
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)
35
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object f so that reads see prepend before f's own data."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Index of the next unread byte in _buffer.  None means the buffer is
        # exhausted and reads go straight to the underlying file.
        self._read = 0

    def read(self, size):
        """Return size bytes, serving buffered bytes before the file's."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request can be answered entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain what is left of the buffer and take the remainder from
            # the underlying file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Push bytes back so that later read() calls return them.

        With readprevious true, and when at least len(prepend) buffered
        bytes have already been consumed, the read position is simply moved
        back by len(prepend) and the prepend bytes themselves are not stored.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the still-unread part of the buffer.  The position is
            # self._read; the bare name `read` used here previously was
            # undefined and raised NameError.
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the buffered bytes not yet consumed, without consuming them."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek, staying inside the buffer for a relative seek when possible.

        Any other seek discards the buffer and is forwarded to the
        underlying file.
        """
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # Convert to an offset relative to the underlying file's
                # position, which sits just past the buffered bytes.
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Anything not defined here is delegated to the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +000090
91
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (only when no fileobj was
    # passed in).  close() closes it; a caller-supplied fileobj is left open.
    myfileobj = None
    # Upper bound for the chunk size that read() requests from _read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name') and isinstance(fileobj.name, str):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate data with no zlib header/trailer;
            # the gzip framing is written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak the file that was opened a few lines above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode ".gz" is appended when the name does not already end
        with it.  Accessing the property emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real file rather than the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the name, running CRC and size counters for writing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Two header bytes that _read_gzip_header() skips as the
        # extra-flags and OS fields.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size counters for a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header from the underlying file.

        Raises EOFError when no more data is available and IOError when the
        magic number or the compression method is not the expected one.
        The header's timestamp is stored in self.mtime; the optional fields
        are skipped.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress is only (re)created by _read() after
        # this method returns, so at this point it is the decompressor left
        # over from the previous member (or does not exist yet for the first
        # one).  The bytes reported by unused() also remain in the padded
        # file for the next read().  Confirm whether this step is needed.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data, write it to the underlying file and return len(data).

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was not opened for writing.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was not opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size bytes after a single _read() call.

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was not opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered bytes without advancing the stream position.

        Requests for fewer than 100 bytes are treated as requests for 100.
        Raises IOError (EBADF) if the file was not opened for reading.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Give back len(buf) bytes that were just returned by read().

        Only the counters are adjusted; extrabuf itself is not modified.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and append their output to extrabuf.

        Raises EOFError when the underlying file is exhausted (or this
        object has no file any more).
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append freshly decompressed data to extrabuf and update counters."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that lies before the current position.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match the data
        that was decompressed.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file opened by the constructor.

        In write mode the remaining compressed data and the trailer (CRC and
        size) are written first.  A fileobj supplied by the caller is not
        closed.  Calling close() more than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so that it does not stay
        # half-open if writing the trailer fails.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release the file we opened ourselves, even when the
            # trailer could not be written.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file.

        zlib_mode is passed to the compressor's flush().  In read mode this
        does nothing.  Raises ValueError if the file is closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream and return it.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes.  In read mode a backward seek rewinds
        and reads forward again.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most size bytes long if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000579
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000580
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 1-9.
    """
    buf = io.BytesIO()
    # Leaving the with block closes the GzipFile, which writes the trailer
    # into buf before its contents are returned.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()
588 return buf.getvalue()
589
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()
595 return f.read()
596
597
def _test():
    """Command-line driver: compress the named files, or decompress with -d.

    "-" (also the default when no file is given) means standard input to
    standard output.  Otherwise FILE is compressed to FILE.gz, or FILE.gz
    is decompressed to FILE.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The raw streams in use are the .buffer objects, so those are what
        # must be left open.  A GzipFile wrapped around them is still closed,
        # which writes the gzip trailer without closing the wrapped stream.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
634 f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000635
if __name__ == '__main__':
    _test()