blob: 6aacc9a4f96c7480bdf8318cda419a03872e6001 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks for the flag byte of a gzip member header (see RFC 1952).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def U32(i):
    """Return i reinterpreted as an unsigned 32-bit integer.

    A negative i is shifted up by 2**32; a non-negative i is returned
    unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Return the 32 least significant bits of i as a non-negative int."""
    low_mask = (1 << 32) - 1
    return i & low_mask
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    value is packed with the struct format "<L", so it has to fit in an
    unsigned 32-bit integer; struct.pack() raises struct.error otherwise.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
35
def read32(input):
    """Read 4 bytes from input and return them as a little-endian unsigned int."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Create a GzipFile for filename; shorthand for calling the class.

    filename is mandatory.  mode ('rb' when omitted) and compresslevel
    (9 when omitted) are handed to the GzipFile constructor unchanged.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
47
Antoine Pitrou7b969842010-09-23 16:22:51 +000048class _PaddedFile:
49 """Minimal read-only file object that prepends a string to the contents
50 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
51 essential functionality."""
52
53 def __init__(self, f, prepend=b''):
54 self._buffer = prepend
55 self._length = len(prepend)
56 self.file = f
57 self._read = 0
58
59 def read(self, size):
60 if self._read is None:
61 return self.file.read(size)
62 if self._read + size <= self._length:
63 read = self._read
64 self._read += size
65 return self._buffer[read:self._read]
66 else:
67 read = self._read
68 self._read = None
69 return self._buffer[read:] + \
70 self.file.read(size-self._length+read)
71
72 def prepend(self, prepend=b'', readprevious=False):
73 if self._read is None:
74 self._buffer = prepend
75 elif readprevious and len(prepend) <= self._read:
76 self._read -= len(prepend)
77 return
78 else:
79 self._buffer = self._buffer[read:] + prepend
80 self._length = len(self._buffer)
81 self._read = 0
82
83 def unused(self):
84 if self._read is None:
85 return b''
86 return self._buffer[self._read:]
87
88 def seek(self, offset, whence=0):
89 # This is only ever called with offset=whence=0
90 if whence == 1 and self._read is not None:
91 if 0 <= offset + self._read <= self._length:
92 self._read += offset
93 return
94 else:
95 offset += self._length - self._read
96 self._read = None
97 self._buffer = None
98 return self.file.seek(offset, whence)
99
100 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +0000101 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.

    """

    # File object opened by the constructor itself; stays None when the
    # caller supplied fileobj.  close() closes it.
    myfileobj = None
    # Upper bound for the size of a single raw read issued by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode is a text mode or does not start with
        'r', 'w' or 'a'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise IOError("Mode " + mode + " not supported")
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened a few lines above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated accessor for the name attribute.

        Emits a DeprecationWarning.  In write mode ".gz" is appended when
        the name does not already end with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        # Show the caller's file object rather than the internal wrapper.
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # The two bytes after the timestamp are the extra-flags and OS
        # fields of the header (see the matching reads in
        # _read_gzip_header()).
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Raises EOFError if no more data is available and IOError if the
        magic number or the compression method is not the expected one.
        Stores the header timestamp in self.mtime.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): this feeds leftover buffered bytes to the
        # decompressor that exists *before* _read() creates the one for
        # this member -- kept exactly as in the original; confirm intent.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data, write it to the underlying file and return len(data).

        Raises ValueError if the file is closed and IOError(EBADF) if it
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError(EBADF) if it
        was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At least 100 bytes are requested regardless of n; fewer are
        returned only when less data is available.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # Ensure that we don't return b"" if we haven't reached EOF.
                while self.extrasize == 0:
                    # 1024 is the same buffering heuristic used in read()
                    self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back by rewinding the position inside the read buffer."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and append the decompressed result to
        the read buffer.

        Raises EOFError once the underlying file is exhausted (or closed).
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update CRC/size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises IOError if the stored CRC or length does not match the
        values computed while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed even if writing the trailer fails; a caller-supplied
        fileobj is never closed.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Detach first so the object counts as closed even if the
        # trailer cannot be written.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor (using zlib_mode) and the
        underlying file object.  Does nothing in read mode."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always return True; see seek() for the supported operations."""
        return True

    def seek(self, offset, whence=0):
        """Move to uncompressed position offset and return the new position.

        whence may be 0 (absolute) or 1 (relative); seeking from the end
        raises ValueError.  In write mode only forward seeks are possible
        and the gap is filled with zero bytes.  In read mode a backward
        seek rewinds and re-reads the stream from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, limited to size bytes when size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000575
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000576
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the resulting bytes.

    compresslevel is the optional compression level, from 0 to 9.
    """
    sink = io.BytesIO()
    with GzipFile(fileobj=sink, mode='wb',
                  compresslevel=compresslevel) as gz:
        gz.write(data)
    compressed = sink.getvalue()
    return compressed
585
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        return gz.read()
592
593
def _test():
    """Tiny command-line driver: compress the named files, or decompress
    them when the first argument is -d.

    "-" (also the default when no file is given) means stdin/stdout.
    Input files are never deleted and no other gzip options are supported.
    """
    names = sys.argv[1:]
    unzip = names and names[0] == "-d"
    if unzip:
        names = names[1:]
    if not names:
        names = ["-"]
    for name in names:
        use_std_streams = name == "-"
        if unzip and use_std_streams:
            src = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
            dst = sys.stdout.buffer
        elif unzip:
            if name[-3:] != ".gz":
                print("filename doesn't end in .gz:", repr(name))
                continue
            src = open(name, "rb")
            dst = builtins.open(name[:-3], "wb")
        elif use_std_streams:
            src = sys.stdin.buffer
            dst = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
        else:
            src = builtins.open(name, "rb")
            dst = open(name + ".gz", "wb")
        # Copy in 1 KiB pieces until the source is exhausted.
        chunk = src.read(1024)
        while chunk:
            dst.write(chunk)
            chunk = src.read(1024)
        # The process-wide standard streams are left open.
        if dst is not sys.stdout.buffer:
            dst.close()
        if src is not sys.stdin.buffer:
            src.close()


if __name__ == '__main__':
    _test()