blob: 8fb1ed06c9d043e285859cbf097578ef3f74b107 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against / written into the flag byte of a gzip header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def U32(i):
    """Reinterpret a possibly negative 32-bit value as unsigned.

    Negative inputs have 2**32 added to them; non-negative inputs are
    returned unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Keep only the 32 least-significant bits of i (result is >= 0)."""
    low_mask = 0xFFFFFFFF
    return i & low_mask
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output only needs a write() method accepting bytes.
    """
    # "<L" is struct's little-endian, standard-size (4 byte) unsigned long.
    packed = struct.pack("<L", value)
    output.write(packed)
35
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper that builds a GzipFile for filename.

    filename must be supplied.  mode is 'rb' unless overridden and
    compresslevel is 9 unless overridden; all three are handed to the
    GzipFile constructor unchanged.

    """
    return GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
44
class _PaddedFile:
    """Minimal read-only file object that serves a byte string before the
    contents of an actual file.  Shouldn't be used outside of gzip.py, as it
    lacks essential functionality.

    State:
      _buffer -- bytes served before falling through to the wrapped file
      _length -- len(_buffer)
      _read   -- current position inside _buffer, or None once the buffer
                 has been left behind and reads go straight to the file
      file    -- the wrapped file object
    """

    def __init__(self, f, prepend=b''):
        """Wrap file object f, serving the bytes in prepend first."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        """Return up to size bytes, taken from the buffer first and then
        from the wrapped file."""
        if self._read is None:
            # Buffer already exhausted: plain pass-through.
            return self.file.read(size)
        if self._read + size <= self._length:
            # Request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain what is left of the buffer and top up from the file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Make the bytes in prepend available to subsequent read() calls.

        If the buffer has been exhausted, prepend becomes the new buffer.
        If readprevious is true and prepend is no longer than what has
        already been consumed from the buffer, the read position is simply
        moved back by len(prepend); the bytes themselves are not stored.
        Otherwise the new buffer is the unread remainder of the current
        buffer followed by prepend.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # The original expression sliced with an undefined name `read`
            # and raised NameError; the instance read position is what is
            # meant.
            # NOTE(review): operand order (remainder first, then prepend) is
            # kept from the original expression -- confirm it is intended.
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the part of the buffer not yet consumed (b'' if none)."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Reposition the stream.

        A relative seek (whence == 1) that stays inside the buffer only
        moves the buffer position and returns None.  Any other seek drops
        the buffer and is delegated to the wrapped file's seek(), whose
        result is returned.
        """
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Anything not defined here is looked up on the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +000099
100
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.

    """

    # File object opened by the constructor itself (stays None when the
    # caller supplied fileobj); close() closes it.
    myfileobj = None
    # Upper bound for the size of a single raw read issued by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.  Modes containing 't' or 'U' raise IOError.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        if mode and ('t' in mode or 'U' in mode):
            raise IOError("Mode " + mode + " not supported")
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                # e.g. an integer file descriptor; not usable in the header
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            # Wrap so that over-read bytes can be pushed back.
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for name; emits a DeprecationWarning.

        In write mode a '.gz' suffix is appended when name lacks one.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real file rather than the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream bookkeeping used when compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Two fixed bytes follow the timestamp; per RFC 1952 these are the
        # extra-flags byte and the OS byte.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly n bytes from self.fileobj.

        Raises EOFError if the file ends before n bytes were obtained.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            b = self.fileobj.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Consume one gzip member header from self.fileobj.

        Returns False if the file is at EOF (no magic bytes at all) and
        True once a header has been consumed.  Raises IOError for a wrong
        magic number or an unknown compression method, and EOFError (from
        _read_exact) if a fixed-size part of the header is truncated.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')

        # method, flags, mtime, then two bytes that are skipped.
        method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise IOError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present.  struct.unpack
            # returns a tuple, so the length must be unpacked before use.
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): at this point self.decompress is still the object
        # left over from the previous member (_read() creates the new one
        # only after this method returns) -- behavior kept as is.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)
        return True

    def write(self,data):
        """Compress data and write it to the underlying file.

        Returns len(data).  Raises ValueError if the file is closed and
        IOError (EBADF) if it was not opened for writing.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was not opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            while self._read(readsize):
                readsize = min(self.max_read_chunk, readsize * 2)
            size = self.extrasize
        else:               # just get some more of it
            while size > self.extrasize:
                if not self._read(readsize):
                    # No more input: hand out whatever is buffered.
                    if size > self.extrasize:
                        size = self.extrasize
                    break
                readsize = min(self.max_read_chunk, readsize * 2)

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At least 100 bytes are requested internally, so more than n bytes
        may be returned.  Raises IOError (EBADF) in write mode.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            # Ensure that we don't return b"" if we haven't reached EOF.
            # 1024 is the same buffering heuristic used in read()
            while self.extrasize == 0 and self._read(max(n, 1024)):
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push len(buf) bytes back by rewinding the buffer bookkeeping.

        Only the counters change; extrabuf itself is not modified.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their decompressed
        form.

        Returns False when no further data can be produced (file closed,
        no further member, or EOF reached) and True otherwise.
        """
        if self.fileobj is None:
            return False

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                return False
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            return False

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
        return True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update the
        running CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been handed out.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match what was
        computed while reading.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file that was opened by the
        constructor is closed even if writing the trailer fails; a
        caller-supplied fileobj is never closed.  Calling close() again is
        a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with zlib_mode and then flush
        the underlying file.  Raises ValueError if the file is closed."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; a backward seek raises IOError.  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        If size is non-negative, at most size bytes are returned.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000571
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000572
def compress(data, compresslevel=9):
    """Gzip data in a single call and return the resulting bytes.

    compresslevel (0-9) is handed to GzipFile unchanged.
    """
    sink = io.BytesIO()
    with GzipFile(fileobj=sink, mode='wb',
                  compresslevel=compresslevel) as gz:
        gz.write(data)
    # The trailer is only written when the GzipFile is closed, so the
    # buffer is read after the with-block has ended.
    return sink.getvalue()
581
def decompress(data):
    """Gunzip data in a single call.

    Returns everything GzipFile.read() yields for the given bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
588
589
def _test():
    """Tiny command-line driver: compress the named files, or decompress
    them when the first argument is -d.

    "-" (also the default when no file is named) stands for stdin/stdout.
    Input files are left in place and no other gzip options are supported.
    """
    argv = sys.argv[1:]
    unzip = argv and argv[0] == "-d"
    if unzip:
        argv = argv[1:]
    if not argv:
        argv = ["-"]

    for arg in argv:
        use_std_streams = arg == "-"
        if unzip and use_std_streams:
            f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
            g = sys.stdout.buffer
        elif unzip:
            if arg[-3:] != ".gz":
                print("filename doesn't end in .gz:", repr(arg))
                continue
            # open is this module's gzip-aware open, not the builtin.
            f = open(arg, "rb")
            g = builtins.open(arg[:-3], "wb")
        elif use_std_streams:
            f = sys.stdin.buffer
            g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
        else:
            f = builtins.open(arg, "rb")
            g = open(arg + ".gz", "wb")

        # Copy in 1 KiB pieces until the source is exhausted.
        chunk = f.read(1024)
        while chunk:
            g.write(chunk)
            chunk = f.read(1024)

        # The process-wide standard streams are left open.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000627
# Run the command-line driver only when executed as a script.
if __name__ == "__main__":
    _test()