# blob: e1b43a55988a5a952521c199860330647ec8addc [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
import builtins
import errno
import io
import os
import struct
import sys
import time
import zlib
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Public names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against / written into the flag byte of a gzip header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to distinguish readers from writers.
READ, WRITE = 1, 2
18
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative value is treated as a signed 32-bit quantity and shifted
    into the unsigned range by adding 2**32; non-negative values are
    returned unchanged.
    """
    if i < 0:
        i += 1 << 32
    return i
26
def LOWU32(i):
    """Return the low-order 32 bits of i, as a non-negative int."""
    return i & 0xFFFFFFFF
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output is any object with a write() method accepting bytes.  value
    must lie in the range 0 .. 2**32 - 1; struct.pack() raises
    struct.error otherwise, so callers mask wider values beforehand.
    """
    output.write(struct.pack("<L", value))
35
def read32(input):
    """Read 4 bytes from input and return them as a little-endian unsigned int.

    input is any object with a read() method returning bytes.  If fewer
    than 4 bytes are returned, struct.unpack() raises struct.error.
    """
    return struct.unpack("<I", input.read(4))[0]
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)
47
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object f so that the bytes in prepend are read first."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Index of the next unread byte in _buffer, or None once the
        # buffer is exhausted and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return up to size bytes, draining the buffer before the file."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request can be served entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Hand out what is left of the buffer and take the remainder
            # of the request from the wrapped file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Push the bytes in prepend back so later reads return them.

        With readprevious true, and when no more bytes are pushed back
        than have already been consumed from the buffer, the read index
        is simply moved back by len(prepend); this presumably relies on
        prepend being the bytes most recently read from the buffer.
        """
        if self._read is None:
            # Buffer exhausted: start a fresh one.
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the still-unread tail of the buffer.  The original code
            # sliced with an undefined name ("read") and raised NameError.
            # NOTE(review): the leftover tail is placed before the new
            # bytes; confirm this ordering is what callers expect.
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the buffered bytes not yet consumed (b'' if none)."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek in the padded stream; returns the wrapped file's seek() result.

        A relative seek (whence == 1) that stays inside the buffer only
        moves the read index and returns None.  Any other seek drops the
        buffer and is forwarded to the wrapped file.
        """
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # The wrapped file is positioned after the unread part of
                # the buffer, so the logical position lies that many bytes
                # before it; compensate before seeking the file itself.
                offset -= self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Delegate everything else to the wrapped file object.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if mode is a text mode or is otherwise unsupported.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise IOError("Mode " + mode + " not supported")
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                # e.g. an integer file descriptor; not usable in the header
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode '.gz' is appended unless the name already ends
        with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real underlying file, not the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the name, CRC and size bookkeeping used while writing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags byte
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the CRC and size bookkeeping before reading a member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Stores the header timestamp in self.mtime; the optional extra,
        filename, comment and header-CRC fields are read and discarded.
        Raises EOFError if no bytes are left, and IOError if the magic
        number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # Skip the two bytes that follow the timestamp (extra flags, OS).
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): at this point self.decompress is still the object
        # of the previous member (the new one is created by _read() after
        # this method returns) -- confirm this is intended.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it out; return the number of bytes consumed.

        Raises ValueError if the file is closed and IOError (EBADF) if
        the object was not opened for writing.
        """
        self._check_closed()
        if self.mode != WRITE:
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; all remaining data if size
        is negative or None.

        Raises ValueError if the file is closed and IOError (EBADF) if
        the object was not opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        if size is None:
            # Follow the io convention that None means "read everything".
            size = -1

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At least 100 bytes are requested internally, so more than n bytes
        may be returned; fewer are returned near the end of the data.
        """
        self._check_closed()
        if self.mode != READ:
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back so that it is returned again by the next read."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their decompressed form.

        Raises EOFError once the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update the CRC."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match the
        values computed while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed; a caller-supplied fileobj is left open.  Calling close()
        more than once is harmless.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so that a failing write below
        # cannot leave it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file (write mode).

        Does nothing in read mode.  Raises ValueError if the file is closed.
        """
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream.

        Seeking relative to the end is not supported (ValueError).  In
        write mode only forward seeks are possible and the gap is filled
        with zero bytes; in read mode a backward seek rewinds the file
        and decompresses again up to the target.  Returns the new offset.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most size bytes long if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000572
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000573
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 1-9.
    """
    buf = io.BytesIO()
    # Leaving the with-block closes the GzipFile, which writes the trailer
    # into buf before its contents are returned.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()
582
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()
589
590
def _test():
    """Command-line driver: compress, or with -d decompress, the named files.

    "-" (the default when no file is given) stands for stdin/stdout.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close whatever was opened here, but never the process's own
            # standard streams.
            if g is not sys.stdout.buffer:
                g.close()
            if f is not sys.stdin.buffer:
                f.close()

if __name__ == '__main__':
    _test()