blob: 4462187116df35bb4629a659fcf9936a40b9fbc5 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
18
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is treated as a 32-bit two's-complement value and mapped
    into the range 0 .. 2**32 - 1 by adding 2**32; non-negative values are
    returned unchanged.
    """
    if i < 0:
        i += 1 << 32
    return i
26
def LOWU32(i):
    """Return the low-order 32 bits of i, as a non-negative int."""
    return i & 0xFFFFFFFF
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output is any object with a write() method that accepts bytes.
    value must be in the range 0 .. 2**32 - 1; struct.pack() raises
    struct.error for anything outside that range.
    """
    output.write(struct.pack("<L", value))
35
def read32(input):
    """Read 4 bytes from input and return them as an unsigned integer.

    The bytes are interpreted in little-endian order.  input is any object
    with a read(n) method returning bytes; struct.unpack() raises
    struct.error if fewer than 4 bytes are returned.
    """
    return struct.unpack("<I", input.read(4))[0]
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)
47
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        # _buffer holds the prepended bytes, _length is its size and _read
        # is the index of the next unread byte in it.  _read is set to None
        # once the buffer is exhausted and reads go straight to the file.
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        """Return up to size bytes, serving the prepended buffer first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain what is left of the buffer and take the remainder of
            # the request from the underlying file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Push bytes back so that they are returned by the next read().

        With readprevious true and a buffer that still contains at least
        len(prepend) already-consumed bytes, the read index is simply moved
        back by that amount.  Otherwise prepend is appended after whatever
        part of the buffer has not been consumed yet.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the not-yet-consumed tail of the current buffer.  (This
            # used to reference an undefined local name and raised
            # NameError.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the prepended bytes that have not been read yet."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Discard the buffer and seek the underlying file."""
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # NOTE(review): if the buffer always holds bytes already
                # consumed from self.file, this adjustment looks like it has
                # the wrong sign -- confirm before relying on relative seeks.
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Delegate every attribute not defined here to the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound for the size of a single raw read issued by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name') and isinstance(fileobj.name, str):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute (emits a warning)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        # Show the real underlying file rather than the padding wrapper.
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the name, CRC, size and buffering state for writing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            # A bytes filename has no encode() method; use it as it is.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Raises EOFError if no more data is available and IOError if the
        magic number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data, write it out and return the number of bytes consumed.

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError (EBADF) if it
        was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At least 100 bytes are requested internally, so more than n bytes
        may be returned.  Raises IOError (EBADF) in write mode.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back by moving the stream position backwards."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress one chunk of at most size raw bytes.

        The decompressed data is appended to the internal buffer.  Raises
        EOFError once the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC/size trailer of a member and skip zero padding.

        Raises IOError if the stored CRC or length does not match the
        values computed while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed even if writing the trailer fails; a caller-supplied fileobj
        is left open.  Calling close() more than once is harmless.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed before doing any I/O so that a failure
        # below cannot leave it half-closed.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the compressor (in write mode) and the underlying file."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative); anything else raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; in read mode a backward seek rewinds
        and re-reads the stream from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, limited to size bytes if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000569
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000570
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 1-9.
    """
    buf = io.BytesIO()
    # Leaving the with block closes the GzipFile, which writes the trailer
    # into buf before its contents are returned.
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()
579
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()
586
587
def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # f and g are the binary buffers of the standard streams, not the
        # text wrappers, so compare against .buffer to avoid closing them.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000625
# Run the command-line driver when the module is executed as a script.
if __name__ == '__main__':
    _test()