blob: f8cd2a1bd6ef7ff9218d7be00354e2c3a7da9bcd [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
# Names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks tested against / written to the flags byte of a gzip member
# header (see _read_gzip_header() and _write_gzip_header()).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ, WRITE = 1, 2
18
def U32(i):
    """Map a possibly negative 32-bit value onto its unsigned equivalent.

    Negative inputs are shifted up by 2**32; non-negative inputs are
    returned unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Return the low-order 32 bits of *i* as a non-negative int."""
    low_mask = (1 << 32) - 1
    return i & low_mask
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    The value is packed with the struct format "<L" and handed to
    output.write() in a single call.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
35
def read32(input):
    """Read four bytes from *input* and decode them as a little-endian
    unsigned 32-bit integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Create a GzipFile for *filename* and return it.

    This is a convenience wrapper around the GzipFile constructor.
    *filename* must be supplied; *mode* falls back to 'rb' and
    *compresslevel* falls back to 9 when omitted.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
47
Antoine Pitrou7b969842010-09-23 16:22:51 +000048class _PaddedFile:
49 """Minimal read-only file object that prepends a string to the contents
50 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
51 essential functionality."""
52
53 def __init__(self, f, prepend=b''):
54 self._buffer = prepend
55 self._length = len(prepend)
56 self.file = f
57 self._read = 0
58
59 def read(self, size):
60 if self._read is None:
61 return self.file.read(size)
62 if self._read + size <= self._length:
63 read = self._read
64 self._read += size
65 return self._buffer[read:self._read]
66 else:
67 read = self._read
68 self._read = None
69 return self._buffer[read:] + \
70 self.file.read(size-self._length+read)
71
72 def prepend(self, prepend=b'', readprevious=False):
73 if self._read is None:
74 self._buffer = prepend
75 elif readprevious and len(prepend) <= self._read:
76 self._read -= len(prepend)
77 return
78 else:
79 self._buffer = self._buffer[read:] + prepend
80 self._length = len(self._buffer)
81 self._read = 0
82
83 def unused(self):
84 if self._read is None:
85 return b''
86 return self._buffer[self._read:]
87
88 def seek(self, offset, whence=0):
89 # This is only ever called with offset=whence=0
90 if whence == 1 and self._read is not None:
91 if 0 <= offset + self._read <= self._length:
92 self._read += offset
93 return
94 else:
95 offset += self._length - self._read
96 self._read = None
97 self._buffer = None
98 return self.file.seek(offset, whence)
99
100 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +0000101 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            # fileobj.name is not necessarily a path (a file created from a
            # descriptor reports the integer descriptor); only a str or
            # bytes name can be stored in the gzip header.
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode '.gz' is appended when the name does not already end
        with it.  Accessing this property emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real underlying file rather than the read wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the state used while compressing (name, CRC, size)."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            # A bytes name is written as given; only str needs encoding.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # The two bytes following the timestamp are the ones that
        # _read_gzip_header() skips as "extraflag" and "os".
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header, discarding optional fields.

        Raises EOFError when no more data is available and IOError when the
        data does not start with a supported gzip header.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress *data*, write it out and return len(data).

        Raises ValueError if the file is closed and IOError if it was not
        opened for writing.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes; everything if size < 0.

        Raises ValueError if the file is closed and IOError if it was not
        opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to *size* bytes, reading more input only when the
        internal buffer is empty.

        An empty result means end of file.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            # A single _read() call may consume input without producing any
            # decompressed data.  Keep going until data is available or
            # EOFError is raised, so that b'' is returned only at EOF.
            while self.extrasize <= 0:
                self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered bytes without advancing the position.

        At least 100 bytes are requested internally, so more than *n* bytes
        may be returned.  An empty result means end of file.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # Loop so that b'' is not returned before EOF when a
                # _read() call yields no decompressed data.
                # 1024 is the same buffering heuristic used in read()
                while self.extrasize == 0:
                    self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push *buf* back by moving the stream position backwards."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and append the decompressed result
        to the internal buffer.

        Raises EOFError once the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer and update CRC/size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        # Drop the part of the buffer that has already been consumed.
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises IOError if either value does not match the data read.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the gzip stream (in write mode) and release the file.

        A file object passed in by the caller is left open; a file opened
        by the constructor is closed.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front and release our own file in a
        # finally clause, so that an error while writing the trailer cannot
        # leave the object half-open or leak the file.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to *offset* in the uncompressed stream and return the new
        position.

        Only whence 0 (absolute) and 1 (relative) are supported.  In write
        mode the gap is filled with zero bytes and seeking backwards raises
        IOError; in read mode a backwards seek rewinds and re-reads.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most *size* bytes if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000589
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000590
def compress(data, compresslevel=9):
    """Gzip-compress *data* in a single call and return the result.

    *compresslevel* is passed on to GzipFile; it selects the compression
    level, from 1 to 9.
    """
    sink = io.BytesIO()
    with GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel) as gz:
        gz.write(data)
    compressed = sink.getvalue()
    return compressed
599
def decompress(data):
    """Decompress gzip-compressed *data* in a single call.

    Returns everything GzipFile.read() yields for the given bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
606
607
def _test():
    """Command-line driver: compress the named files, or decompress with -d.

    The argument "-" (also the default when no file is given) stands for
    standard input and standard output.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The streams used above are the binary .buffer objects, so those
        # are the ones that must be left open.  Comparing against
        # sys.stdout / sys.stdin never matched and closed the process's
        # standard streams.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()

if __name__ == '__main__':
    _test()