blob: ba2149ebf970938c44998a54d3f84bdd264861ad [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks for the flag byte of a gzip member header.  _read_gzip_header()
# tests FEXTRA, FNAME, FCOMMENT and FHCRC; _write_gzip_header() sets FNAME.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction the file was opened in.
READ, WRITE = 1, 2
18
def U32(i):
    """Map a possibly negative 32-bit value onto its unsigned equivalent.

    Negative inputs are shifted up by 2**32; non-negative inputs are
    returned unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Keep only the 32 least-significant bits of i (never negative)."""
    low_mask = (1 << 32) - 1
    return i & low_mask
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer."""
    packed = struct.pack("<L", value)
    output.write(packed)
35
def read32(input):
    """Read 4 bytes from input and decode them as a little-endian unsigned int."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return the resulting GzipFile.

    Equivalent to GzipFile(filename, mode, compresslevel): filename must
    be supplied, mode falls back to 'rb' and compresslevel to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
47
Antoine Pitrou7b969842010-09-23 16:22:51 +000048class _PaddedFile:
49 """Minimal read-only file object that prepends a string to the contents
50 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
51 essential functionality."""
52
53 def __init__(self, f, prepend=b''):
54 self._buffer = prepend
55 self._length = len(prepend)
56 self.file = f
57 self._read = 0
58
59 def read(self, size):
60 if self._read is None:
61 return self.file.read(size)
62 if self._read + size <= self._length:
63 read = self._read
64 self._read += size
65 return self._buffer[read:self._read]
66 else:
67 read = self._read
68 self._read = None
69 return self._buffer[read:] + \
70 self.file.read(size-self._length+read)
71
72 def prepend(self, prepend=b'', readprevious=False):
73 if self._read is None:
74 self._buffer = prepend
75 elif readprevious and len(prepend) <= self._read:
76 self._read -= len(prepend)
77 return
78 else:
79 self._buffer = self._buffer[read:] + prepend
80 self._length = len(self._buffer)
81 self._read = 0
82
83 def unused(self):
84 if self._read is None:
85 return b''
86 return self._buffer[self._read:]
87
88 def seek(self, offset, whence=0):
89 # This is only ever called with offset=whence=0
90 if whence == 1 and self._read is not None:
91 if 0 <= offset + self._read <= self._length:
92 self._read += offset
93 return
94 else:
95 offset += self._length - self._read
96 self._read = None
97 self._buffer = None
98 return self.file.seek(offset, whence)
99
100 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +0000101 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself when no fileobj was
    # passed in; close() closes it.  Stays None otherwise.
    myfileobj = None
    # Upper bound for the chunk size that read() requests from _read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb
Guido van Rossum68de3791997-07-19 20:22:23 +0000112
Tim Peters07e99cb2001-01-14 23:47:14 +0000113 def __init__(self, filename=None, mode=None,
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000114 compresslevel=9, fileobj=None, mtime=None):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000115 """Constructor for the GzipFile class.
116
117 At least one of fileobj and filename must be given a
118 non-trivial value.
119
120 The new class instance is based on fileobj, which can be a regular
121 file, a StringIO object, or any other object which simulates a file.
122 It defaults to None, in which case filename is opened to provide
123 a file object.
124
125 When fileobj is not None, the filename argument is only used to be
126 included in the gzip file header, which may includes the original
127 filename of the uncompressed file. It defaults to the filename of
128 fileobj, if discernible; otherwise, it defaults to the empty string,
129 and in this case the original filename is not included in the header.
130
131 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
132 depending on whether the file will be read or written. The default
133 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
134 Be aware that only the 'rb', 'ab', and 'wb' values should be used
135 for cross-platform portability.
136
137 The compresslevel argument is an integer from 1 to 9 controlling the
138 level of compression; 1 is fastest and produces the least compression,
139 and 9 is slowest and produces the most compression. The default is 9.
140
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000141 The mtime argument is an optional numeric timestamp to be written
142 to the stream when compressing. All gzip compressed streams
143 are required to contain a timestamp. If omitted or None, the
144 current time is used. This module ignores the timestamp when
145 decompressing; however, some programs, such as gunzip, make use
146 of it. The format of the timestamp is the same as that of the
147 return value of time.time() and of the st_mtime member of the
148 object returned by os.stat().
149
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000150 """
151
Skip Montanaro12424bc2002-05-23 01:43:05 +0000152 # guarantee the file is opened in binary mode on platforms
153 # that care about that sort of thing
154 if mode and 'b' not in mode:
155 mode += 'b'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000156 if fileobj is None:
Georg Brandl1a3284e2007-12-02 09:40:06 +0000157 fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
Guido van Rossum68de3791997-07-19 20:22:23 +0000158 if filename is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000159 if hasattr(fileobj, 'name'): filename = fileobj.name
160 else: filename = ''
Guido van Rossum68de3791997-07-19 20:22:23 +0000161 if mode is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000162 if hasattr(fileobj, 'mode'): mode = fileobj.mode
Fred Drake9bb76d11999-04-05 18:33:40 +0000163 else: mode = 'rb'
Guido van Rossum68de3791997-07-19 20:22:23 +0000164
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000165 if mode[0:1] == 'r':
166 self.mode = READ
Tim Peters07e99cb2001-01-14 23:47:14 +0000167 # Set flag indicating start of a new member
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000168 self._new_member = True
Antoine Pitroub1f88352010-01-03 22:37:40 +0000169 # Buffer data read from gzip file. extrastart is offset in
170 # stream where buffer starts. extrasize is number of
171 # bytes remaining in buffer from current stream position.
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000172 self.extrabuf = b""
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000173 self.extrasize = 0
Antoine Pitroub1f88352010-01-03 22:37:40 +0000174 self.extrastart = 0
Thomas Wouterscf297e42007-02-23 15:07:44 +0000175 self.name = filename
Thomas Wouters477c8d52006-05-27 19:21:47 +0000176 # Starts small, scales exponentially
177 self.min_readsize = 100
Antoine Pitrou7b969842010-09-23 16:22:51 +0000178 fileobj = _PaddedFile(fileobj)
Guido van Rossum15262191997-04-30 16:04:57 +0000179
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000180 elif mode[0:1] == 'w' or mode[0:1] == 'a':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000181 self.mode = WRITE
182 self._init_write(filename)
183 self.compress = zlib.compressobj(compresslevel,
Tim Peters07e99cb2001-01-14 23:47:14 +0000184 zlib.DEFLATED,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000185 -zlib.MAX_WBITS,
186 zlib.DEF_MEM_LEVEL,
187 0)
188 else:
Collin Winterce36ad82007-08-30 01:19:48 +0000189 raise IOError("Mode " + mode + " not supported")
Guido van Rossum15262191997-04-30 16:04:57 +0000190
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000191 self.fileobj = fileobj
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000192 self.offset = 0
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000193 self.mtime = mtime
Guido van Rossum15262191997-04-30 16:04:57 +0000194
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000195 if self.mode == WRITE:
196 self._write_gzip_header()
Guido van Rossum15262191997-04-30 16:04:57 +0000197
Thomas Wouterscf297e42007-02-23 15:07:44 +0000198 @property
199 def filename(self):
200 import warnings
Philip Jenveya394f2d2009-05-08 03:57:12 +0000201 warnings.warn("use the name attribute", DeprecationWarning, 2)
Thomas Wouterscf297e42007-02-23 15:07:44 +0000202 if self.mode == WRITE and self.name[-3:] != ".gz":
203 return self.name + ".gz"
204 return self.name
205
Guido van Rossum15262191997-04-30 16:04:57 +0000206 def __repr__(self):
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000207 fileobj = self.fileobj
208 if isinstance(fileobj, _PaddedFile):
209 fileobj = fileobj.file
210 s = repr(fileobj)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000211 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
Guido van Rossum15262191997-04-30 16:04:57 +0000212
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000213 def _check_closed(self):
214 """Raises a ValueError if the underlying file object has been closed.
215
216 """
217 if self.closed:
218 raise ValueError('I/O operation on closed file.')
219
Guido van Rossum15262191997-04-30 16:04:57 +0000220 def _init_write(self, filename):
Thomas Wouterscf297e42007-02-23 15:07:44 +0000221 self.name = filename
Antoine Pitrou77b338b2009-12-14 18:00:06 +0000222 self.crc = zlib.crc32(b"") & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000223 self.size = 0
224 self.writebuf = []
225 self.bufsize = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000226
227 def _write_gzip_header(self):
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000228 self.fileobj.write(b'\037\213') # magic header
229 self.fileobj.write(b'\010') # compression method
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000230 try:
Lars Gustäbelead70562007-08-13 09:05:16 +0000231 # RFC 1952 requires the FNAME field to be Latin-1. Do not
232 # include filenames that cannot be represented that way.
Lars Gustäbel1440df22009-10-29 09:39:47 +0000233 fname = os.path.basename(self.name)
234 fname = fname.encode('latin-1')
Lars Gustäbelead70562007-08-13 09:05:16 +0000235 if fname.endswith(b'.gz'):
236 fname = fname[:-3]
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000237 except UnicodeEncodeError:
Lars Gustäbelead70562007-08-13 09:05:16 +0000238 fname = b''
239 flags = 0
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000240 if fname:
241 flags = FNAME
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000242 self.fileobj.write(chr(flags).encode('latin-1'))
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000243 mtime = self.mtime
244 if mtime is None:
245 mtime = time.time()
246 write32u(self.fileobj, int(mtime))
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000247 self.fileobj.write(b'\002')
248 self.fileobj.write(b'\377')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000249 if fname:
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000250 self.fileobj.write(fname + b'\000')
Guido van Rossum15262191997-04-30 16:04:57 +0000251
252 def _init_read(self):
Antoine Pitrou77b338b2009-12-14 18:00:06 +0000253 self.crc = zlib.crc32(b"") & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000254 self.size = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000255
    def _read_gzip_header(self):
        """Consume the header of the next gzip member from self.fileobj.

        Stores the member's timestamp in self.mtime and skips the optional
        extra, file name, comment and header-CRC fields.

        Raises EOFError if nothing is left to read, and IOError if the magic
        number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # Skip the extra-flags byte and the OS byte; neither value is used.
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present.  Its length is a
            # 16-bit little-endian number.
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress here is whatever decompressor was
        # left over from earlier; _read() only creates the one for this
        # member after this method returns -- confirm this is intended.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)
296
Guido van Rossum15262191997-04-30 16:04:57 +0000297 def write(self,data):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000298 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000299 if self.mode != WRITE:
300 import errno
301 raise IOError(errno.EBADF, "write() on read-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000302
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000303 if self.fileobj is None:
Collin Winterce36ad82007-08-30 01:19:48 +0000304 raise ValueError("write() on closed GzipFile object")
Antoine Pitroub1f88352010-01-03 22:37:40 +0000305
306 # Convert data type if called by io.BufferedWriter.
307 if isinstance(data, memoryview):
308 data = data.tobytes()
309
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000310 if len(data) > 0:
311 self.size = self.size + len(data)
Christian Heimesfe337bf2008-03-23 21:54:12 +0000312 self.crc = zlib.crc32(data, self.crc) & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000313 self.fileobj.write( self.compress.compress(data) )
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000314 self.offset += len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000315
Antoine Pitroub1f88352010-01-03 22:37:40 +0000316 return len(data)
317
Guido van Rossum56068012000-02-02 16:51:06 +0000318 def read(self, size=-1):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000319 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000320 if self.mode != READ:
321 import errno
Brett Cannonedfb3022003-12-04 19:28:06 +0000322 raise IOError(errno.EBADF, "read() on write-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000323
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000324 if self.extrasize <= 0 and self.fileobj is None:
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000325 return b''
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000326
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000327 readsize = 1024
Guido van Rossum56068012000-02-02 16:51:06 +0000328 if size < 0: # get the whole thing
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000329 try:
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000330 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000331 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000332 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000333 except EOFError:
334 size = self.extrasize
335 else: # just get some more of it
336 try:
337 while size > self.extrasize:
338 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000339 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000340 except EOFError:
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000341 if size > self.extrasize:
342 size = self.extrasize
Tim Peters07e99cb2001-01-14 23:47:14 +0000343
Antoine Pitroub1f88352010-01-03 22:37:40 +0000344 offset = self.offset - self.extrastart
345 chunk = self.extrabuf[offset: offset + size]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000346 self.extrasize = self.extrasize - size
Guido van Rossum15262191997-04-30 16:04:57 +0000347
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000348 self.offset += size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000349 return chunk
Guido van Rossum15262191997-04-30 16:04:57 +0000350
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000351 def peek(self, n):
352 if self.mode != READ:
353 import errno
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000354 raise IOError(errno.EBADF, "peek() on write-only GzipFile object")
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000355
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000356 # Do not return ridiculously small buffers, for one common idiom
357 # is to call peek(1) and expect more bytes in return.
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000358 if n < 100:
359 n = 100
360 if self.extrasize == 0:
361 if self.fileobj is None:
362 return b''
363 try:
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000364 # 1024 is the same buffering heuristic used in read()
365 self._read(max(n, 1024))
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000366 except EOFError:
367 pass
368 offset = self.offset - self.extrastart
369 remaining = self.extrasize
370 assert remaining == len(self.extrabuf) - offset
371 return self.extrabuf[offset:offset + n]
372
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000373 def _unread(self, buf):
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000374 self.extrasize = len(buf) + self.extrasize
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000375 self.offset -= len(buf)
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000376
    def _read(self, size=1024):
        """Read one chunk of compressed data and buffer its decompressed form.

        Up to size bytes are read from self.fileobj, decompressed and handed
        to _add_read_data().  When the _new_member flag is set, the header
        of the next gzip member is parsed first.

        Raises EOFError if self.fileobj is None or if no more data can be
        read from it.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            # Negative wbits: the member body is a raw deflate stream.
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
Tim Peters07e99cb2001-01-14 23:47:14 +0000418
419 def _add_read_data(self, data):
Christian Heimesfe337bf2008-03-23 21:54:12 +0000420 self.crc = zlib.crc32(data, self.crc) & 0xffffffff
Antoine Pitroub1f88352010-01-03 22:37:40 +0000421 offset = self.offset - self.extrastart
422 self.extrabuf = self.extrabuf[offset:] + data
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000423 self.extrasize = self.extrasize + len(data)
Antoine Pitroub1f88352010-01-03 22:37:40 +0000424 self.extrastart = self.offset
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000425 self.size = self.size + len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000426
427 def _read_eof(self):
Antoine Pitrou7b969842010-09-23 16:22:51 +0000428 # We've read to the end of the file
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000429 # We check the that the computed CRC and size of the
Tim Peters9288f952002-11-05 20:38:55 +0000430 # uncompressed data matches the stored values. Note that the size
431 # stored is the true file size mod 2**32.
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000432 crc32 = read32(self.fileobj)
Christian Heimesfe337bf2008-03-23 21:54:12 +0000433 isize = read32(self.fileobj) # may exceed 2GB
434 if crc32 != self.crc:
435 raise IOError("CRC check failed %s != %s" % (hex(crc32),
436 hex(self.crc)))
Christian Heimes1dc54002008-03-24 02:19:29 +0000437 elif isize != (self.size & 0xffffffff):
Collin Winterce36ad82007-08-30 01:19:48 +0000438 raise IOError("Incorrect length of data produced")
Tim Peters07e99cb2001-01-14 23:47:14 +0000439
Antoine Pitrou8e33fd72010-01-13 14:37:26 +0000440 # Gzip files can be padded with zeroes and still have archives.
441 # Consume all zero bytes and set the file position to the first
442 # non-zero byte. See http://www.gzip.org/#faq8
443 c = b"\x00"
444 while c == b"\x00":
445 c = self.fileobj.read(1)
446 if c:
Antoine Pitrou7b969842010-09-23 16:22:51 +0000447 self.fileobj.prepend(c, True)
Antoine Pitrou8e33fd72010-01-13 14:37:26 +0000448
Antoine Pitroub1f88352010-01-03 22:37:40 +0000449 @property
450 def closed(self):
451 return self.fileobj is None
452
Guido van Rossum15262191997-04-30 16:04:57 +0000453 def close(self):
Georg Brandlb533e262008-05-25 18:19:30 +0000454 if self.fileobj is None:
455 return
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000456 if self.mode == WRITE:
457 self.fileobj.write(self.compress.flush())
Christian Heimesfe337bf2008-03-23 21:54:12 +0000458 write32u(self.fileobj, self.crc)
Tim Peters9288f952002-11-05 20:38:55 +0000459 # self.size may exceed 2GB, or even 4GB
Christian Heimes1dc54002008-03-24 02:19:29 +0000460 write32u(self.fileobj, self.size & 0xffffffff)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000461 self.fileobj = None
462 elif self.mode == READ:
463 self.fileobj = None
464 if self.myfileobj:
465 self.myfileobj.close()
466 self.myfileobj = None
Guido van Rossum15262191997-04-30 16:04:57 +0000467
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000468 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000469 self._check_closed()
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000470 if self.mode == WRITE:
Tim Peterseba28be2005-03-28 01:08:02 +0000471 # Ensure the compressor's buffer is flushed
472 self.fileobj.write(self.compress.flush(zlib_mode))
Mark Dickinsona9eb87a2010-05-04 18:47:04 +0000473 self.fileobj.flush()
Guido van Rossum15262191997-04-30 16:04:57 +0000474
Tim Peters5cfb05e2004-07-27 21:02:02 +0000475 def fileno(self):
476 """Invoke the underlying file object's fileno() method.
477
478 This will raise AttributeError if the underlying file object
479 doesn't support fileno().
480 """
481 return self.fileobj.fileno()
482
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000483 def rewind(self):
484 '''Return the uncompressed stream file position indicator to the
Tim Petersab9ba272001-08-09 21:40:30 +0000485 beginning of the file'''
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000486 if self.mode != READ:
487 raise IOError("Can't rewind in write mode")
488 self.fileobj.seek(0)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000489 self._new_member = True
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000490 self.extrabuf = b""
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000491 self.extrasize = 0
Antoine Pitroub1f88352010-01-03 22:37:40 +0000492 self.extrastart = 0
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000493 self.offset = 0
494
Antoine Pitroub1f88352010-01-03 22:37:40 +0000495 def readable(self):
496 return self.mode == READ
497
498 def writable(self):
499 return self.mode == WRITE
500
501 def seekable(self):
502 return True
503
Thomas Wouters89f507f2006-12-13 04:49:30 +0000504 def seek(self, offset, whence=0):
505 if whence:
506 if whence == 1:
507 offset = self.offset + offset
508 else:
509 raise ValueError('Seek from end not supported')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000510 if self.mode == WRITE:
511 if offset < self.offset:
512 raise IOError('Negative seek in write mode')
513 count = offset - self.offset
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000514 chunk = bytes(1024)
Tim Petersfb0ea522002-11-04 19:50:11 +0000515 for i in range(count // 1024):
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000516 self.write(chunk)
517 self.write(bytes(count % 1024))
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000518 elif self.mode == READ:
519 if offset < self.offset:
520 # for negative seek, rewind and do positive seek
521 self.rewind()
522 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000523 for i in range(count // 1024):
524 self.read(1024)
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000525 self.read(count % 1024)
526
Antoine Pitroub1f88352010-01-03 22:37:40 +0000527 return self.offset
528
    def readline(self, size=-1):
        """Read and return one line, including its trailing newline if any.

        A negative size means no length limit; otherwise at most size bytes
        are returned.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            # Stop at a line end (or size limit), or when read() returned
            # nothing more.
            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting read size for later calls; the stored
        # value is capped at 512.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000567
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000568
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the result.

    compresslevel is optional and is passed on to GzipFile; it is meant
    to be in the range 1-9.
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with gz:
        gz.write(data)
    return sink.getvalue()
577
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
584
585
def _test():
    """Command-line driver: compress the named files, or decompress with -d.

    "-" (also the default when no file is given) stands for standard
    input and standard output.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # f and g are the binary buffers of the standard streams, never the
        # text wrappers themselves, so compare against .buffer; otherwise
        # the standard streams would be closed after the first argument.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000623
# Run the command-line driver only when executed as a script.
if __name__ == '__main__':
    _test()