blob: a5bfb85ee11a56ac293c48faa35465671b894773 [file] [log] [blame]
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
Guido van Rossum15262191997-04-30 16:04:57 +000015FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
16
17READ, WRITE = 1, 2
18
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    *value* is packed with the struct format "<L" and handed to
    output.write() in a single call.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from *input* and return them as an unsigned int.

    The bytes are decoded little-endian (struct format "<I").
    """
    (value,) = struct.unpack("<I", input.read(4))
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Open *filename* as a gzip file and return the GzipFile object.

    This is a convenience wrapper equivalent to
    GzipFile(filename, mode, compresslevel).  The filename argument is
    required; mode defaults to 'rb' and compresslevel defaults to 9.

    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
35
Antoine Pitrou7b969842010-09-23 16:22:51 +000036class _PaddedFile:
37 """Minimal read-only file object that prepends a string to the contents
38 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
39 essential functionality."""
40
41 def __init__(self, f, prepend=b''):
42 self._buffer = prepend
43 self._length = len(prepend)
44 self.file = f
45 self._read = 0
46
47 def read(self, size):
48 if self._read is None:
49 return self.file.read(size)
50 if self._read + size <= self._length:
51 read = self._read
52 self._read += size
53 return self._buffer[read:self._read]
54 else:
55 read = self._read
56 self._read = None
57 return self._buffer[read:] + \
58 self.file.read(size-self._length+read)
59
60 def prepend(self, prepend=b'', readprevious=False):
61 if self._read is None:
62 self._buffer = prepend
63 elif readprevious and len(prepend) <= self._read:
64 self._read -= len(prepend)
65 return
66 else:
67 self._buffer = self._buffer[read:] + prepend
68 self._length = len(self._buffer)
69 self._read = 0
70
71 def unused(self):
72 if self._read is None:
73 return b''
74 return self._buffer[self._read:]
75
76 def seek(self, offset, whence=0):
77 # This is only ever called with offset=whence=0
78 if whence == 1 and self._read is not None:
79 if 0 <= offset + self._read <= self._length:
80 self._read += offset
81 return
82 else:
83 offset += self._length - self._read
84 self._read = None
85 self._buffer = None
86 return self.file.seek(offset, whence)
87
88 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +000089 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +000090
91
Antoine Pitroub1f88352010-01-03 22:37:40 +000092class GzipFile(io.BufferedIOBase):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000093 """The GzipFile class simulates most of the methods of a file object with
Guido van Rossum97c5fcc2002-08-06 17:03:25 +000094 the exception of the readinto() and truncate() methods.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000095
Nadeem Vawda30d94b72012-02-11 23:45:10 +020096 This class only supports opening files in binary mode. If you need to open a
97 compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.
98
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000099 """
Guido van Rossum15262191997-04-30 16:04:57 +0000100
Guido van Rossum68de3791997-07-19 20:22:23 +0000101 myfileobj = None
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000102 max_read_chunk = 10 * 1024 * 1024 # 10Mb
Guido van Rossum68de3791997-07-19 20:22:23 +0000103
Tim Peters07e99cb2001-01-14 23:47:14 +0000104 def __init__(self, filename=None, mode=None,
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000105 compresslevel=9, fileobj=None, mtime=None):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000106 """Constructor for the GzipFile class.
107
108 At least one of fileobj and filename must be given a
109 non-trivial value.
110
111 The new class instance is based on fileobj, which can be a regular
112 file, a StringIO object, or any other object which simulates a file.
113 It defaults to None, in which case filename is opened to provide
114 a file object.
115
116 When fileobj is not None, the filename argument is only used to be
117 included in the gzip file header, which may includes the original
118 filename of the uncompressed file. It defaults to the filename of
119 fileobj, if discernible; otherwise, it defaults to the empty string,
120 and in this case the original filename is not included in the header.
121
122 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
123 depending on whether the file will be read or written. The default
124 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
Nadeem Vawda30d94b72012-02-11 23:45:10 +0200125 A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
126 'wb', and 'a' and 'ab'.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000127
128 The compresslevel argument is an integer from 1 to 9 controlling the
129 level of compression; 1 is fastest and produces the least compression,
130 and 9 is slowest and produces the most compression. The default is 9.
131
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000132 The mtime argument is an optional numeric timestamp to be written
133 to the stream when compressing. All gzip compressed streams
134 are required to contain a timestamp. If omitted or None, the
135 current time is used. This module ignores the timestamp when
136 decompressing; however, some programs, such as gunzip, make use
137 of it. The format of the timestamp is the same as that of the
138 return value of time.time() and of the st_mtime member of the
139 object returned by os.stat().
140
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000141 """
142
Nadeem Vawda30d94b72012-02-11 23:45:10 +0200143 if mode and ('t' in mode or 'U' in mode):
144 raise IOError("Mode " + mode + " not supported")
Skip Montanaro12424bc2002-05-23 01:43:05 +0000145 if mode and 'b' not in mode:
146 mode += 'b'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000147 if fileobj is None:
Georg Brandl1a3284e2007-12-02 09:40:06 +0000148 fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
Guido van Rossum68de3791997-07-19 20:22:23 +0000149 if filename is None:
Nadeem Vawda892b0b92012-01-18 09:25:58 +0200150 if hasattr(fileobj, 'name') and isinstance(fileobj.name, str):
151 filename = fileobj.name
152 else:
153 filename = ''
Guido van Rossum68de3791997-07-19 20:22:23 +0000154 if mode is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000155 if hasattr(fileobj, 'mode'): mode = fileobj.mode
Fred Drake9bb76d11999-04-05 18:33:40 +0000156 else: mode = 'rb'
Guido van Rossum68de3791997-07-19 20:22:23 +0000157
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000158 if mode[0:1] == 'r':
159 self.mode = READ
Tim Peters07e99cb2001-01-14 23:47:14 +0000160 # Set flag indicating start of a new member
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000161 self._new_member = True
Antoine Pitroub1f88352010-01-03 22:37:40 +0000162 # Buffer data read from gzip file. extrastart is offset in
163 # stream where buffer starts. extrasize is number of
164 # bytes remaining in buffer from current stream position.
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000165 self.extrabuf = b""
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000166 self.extrasize = 0
Antoine Pitroub1f88352010-01-03 22:37:40 +0000167 self.extrastart = 0
Thomas Wouterscf297e42007-02-23 15:07:44 +0000168 self.name = filename
Thomas Wouters477c8d52006-05-27 19:21:47 +0000169 # Starts small, scales exponentially
170 self.min_readsize = 100
Antoine Pitrou7b969842010-09-23 16:22:51 +0000171 fileobj = _PaddedFile(fileobj)
Guido van Rossum15262191997-04-30 16:04:57 +0000172
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000173 elif mode[0:1] == 'w' or mode[0:1] == 'a':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000174 self.mode = WRITE
175 self._init_write(filename)
176 self.compress = zlib.compressobj(compresslevel,
Tim Peters07e99cb2001-01-14 23:47:14 +0000177 zlib.DEFLATED,
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000178 -zlib.MAX_WBITS,
179 zlib.DEF_MEM_LEVEL,
180 0)
181 else:
Collin Winterce36ad82007-08-30 01:19:48 +0000182 raise IOError("Mode " + mode + " not supported")
Guido van Rossum15262191997-04-30 16:04:57 +0000183
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000184 self.fileobj = fileobj
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000185 self.offset = 0
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000186 self.mtime = mtime
Guido van Rossum15262191997-04-30 16:04:57 +0000187
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000188 if self.mode == WRITE:
189 self._write_gzip_header()
Guido van Rossum15262191997-04-30 16:04:57 +0000190
Thomas Wouterscf297e42007-02-23 15:07:44 +0000191 @property
192 def filename(self):
193 import warnings
Philip Jenveya394f2d2009-05-08 03:57:12 +0000194 warnings.warn("use the name attribute", DeprecationWarning, 2)
Thomas Wouterscf297e42007-02-23 15:07:44 +0000195 if self.mode == WRITE and self.name[-3:] != ".gz":
196 return self.name + ".gz"
197 return self.name
198
Guido van Rossum15262191997-04-30 16:04:57 +0000199 def __repr__(self):
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000200 fileobj = self.fileobj
201 if isinstance(fileobj, _PaddedFile):
202 fileobj = fileobj.file
203 s = repr(fileobj)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000204 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
Guido van Rossum15262191997-04-30 16:04:57 +0000205
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000206 def _check_closed(self):
207 """Raises a ValueError if the underlying file object has been closed.
208
209 """
210 if self.closed:
211 raise ValueError('I/O operation on closed file.')
212
Guido van Rossum15262191997-04-30 16:04:57 +0000213 def _init_write(self, filename):
Thomas Wouterscf297e42007-02-23 15:07:44 +0000214 self.name = filename
Antoine Pitrou77b338b2009-12-14 18:00:06 +0000215 self.crc = zlib.crc32(b"") & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000216 self.size = 0
217 self.writebuf = []
218 self.bufsize = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000219
220 def _write_gzip_header(self):
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000221 self.fileobj.write(b'\037\213') # magic header
222 self.fileobj.write(b'\010') # compression method
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000223 try:
Lars Gustäbelead70562007-08-13 09:05:16 +0000224 # RFC 1952 requires the FNAME field to be Latin-1. Do not
225 # include filenames that cannot be represented that way.
Lars Gustäbel1440df22009-10-29 09:39:47 +0000226 fname = os.path.basename(self.name)
227 fname = fname.encode('latin-1')
Lars Gustäbelead70562007-08-13 09:05:16 +0000228 if fname.endswith(b'.gz'):
229 fname = fname[:-3]
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000230 except UnicodeEncodeError:
Lars Gustäbelead70562007-08-13 09:05:16 +0000231 fname = b''
232 flags = 0
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000233 if fname:
234 flags = FNAME
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000235 self.fileobj.write(chr(flags).encode('latin-1'))
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000236 mtime = self.mtime
237 if mtime is None:
238 mtime = time.time()
239 write32u(self.fileobj, int(mtime))
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000240 self.fileobj.write(b'\002')
241 self.fileobj.write(b'\377')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000242 if fname:
Lars Gustäbel5590d8c2007-08-10 12:02:32 +0000243 self.fileobj.write(fname + b'\000')
Guido van Rossum15262191997-04-30 16:04:57 +0000244
245 def _init_read(self):
Antoine Pitrou77b338b2009-12-14 18:00:06 +0000246 self.crc = zlib.crc32(b"") & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000247 self.size = 0
Guido van Rossum15262191997-04-30 16:04:57 +0000248
249 def _read_gzip_header(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000250 magic = self.fileobj.read(2)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000251 if magic == b'':
252 raise EOFError("Reached EOF")
253
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000254 if magic != b'\037\213':
Collin Winterce36ad82007-08-30 01:19:48 +0000255 raise IOError('Not a gzipped file')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000256 method = ord( self.fileobj.read(1) )
257 if method != 8:
Collin Winterce36ad82007-08-30 01:19:48 +0000258 raise IOError('Unknown compression method')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000259 flag = ord( self.fileobj.read(1) )
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000260 self.mtime = read32(self.fileobj)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000261 # extraflag = self.fileobj.read(1)
262 # os = self.fileobj.read(1)
Antoine Pitrou42db3ef2009-01-04 21:37:59 +0000263 self.fileobj.read(2)
Guido van Rossum15262191997-04-30 16:04:57 +0000264
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000265 if flag & FEXTRA:
266 # Read & discard the extra field, if present
Tim Petersfb0ea522002-11-04 19:50:11 +0000267 xlen = ord(self.fileobj.read(1))
268 xlen = xlen + 256*ord(self.fileobj.read(1))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000269 self.fileobj.read(xlen)
270 if flag & FNAME:
271 # Read and discard a null-terminated string containing the filename
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000272 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000273 s = self.fileobj.read(1)
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000274 if not s or s==b'\000':
Tim Petersfb0ea522002-11-04 19:50:11 +0000275 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000276 if flag & FCOMMENT:
277 # Read and discard a null-terminated string containing a comment
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000278 while True:
Tim Petersfb0ea522002-11-04 19:50:11 +0000279 s = self.fileobj.read(1)
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000280 if not s or s==b'\000':
Tim Petersfb0ea522002-11-04 19:50:11 +0000281 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000282 if flag & FHCRC:
283 self.fileobj.read(2) # Read & discard the 16-bit header CRC
Guido van Rossum15262191997-04-30 16:04:57 +0000284
Antoine Pitrou7b969842010-09-23 16:22:51 +0000285 unused = self.fileobj.unused()
286 if unused:
287 uncompress = self.decompress.decompress(unused)
288 self._add_read_data(uncompress)
289
Guido van Rossum15262191997-04-30 16:04:57 +0000290 def write(self,data):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000291 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000292 if self.mode != WRITE:
293 import errno
294 raise IOError(errno.EBADF, "write() on read-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000295
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000296 if self.fileobj is None:
Collin Winterce36ad82007-08-30 01:19:48 +0000297 raise ValueError("write() on closed GzipFile object")
Antoine Pitroub1f88352010-01-03 22:37:40 +0000298
299 # Convert data type if called by io.BufferedWriter.
300 if isinstance(data, memoryview):
301 data = data.tobytes()
302
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000303 if len(data) > 0:
304 self.size = self.size + len(data)
Christian Heimesfe337bf2008-03-23 21:54:12 +0000305 self.crc = zlib.crc32(data, self.crc) & 0xffffffff
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000306 self.fileobj.write( self.compress.compress(data) )
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000307 self.offset += len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000308
Antoine Pitroub1f88352010-01-03 22:37:40 +0000309 return len(data)
310
Guido van Rossum56068012000-02-02 16:51:06 +0000311 def read(self, size=-1):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000312 self._check_closed()
Martin v. Löwisdb044892002-03-11 06:46:52 +0000313 if self.mode != READ:
314 import errno
Brett Cannonedfb3022003-12-04 19:28:06 +0000315 raise IOError(errno.EBADF, "read() on write-only GzipFile object")
Tim Peters863ac442002-04-16 01:38:40 +0000316
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000317 if self.extrasize <= 0 and self.fileobj is None:
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000318 return b''
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000319
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000320 readsize = 1024
Guido van Rossum56068012000-02-02 16:51:06 +0000321 if size < 0: # get the whole thing
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000322 try:
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000323 while True:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000324 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000325 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000326 except EOFError:
327 size = self.extrasize
328 else: # just get some more of it
329 try:
330 while size > self.extrasize:
331 self._read(readsize)
Andrew M. Kuchling01cb47b2005-06-09 14:19:32 +0000332 readsize = min(self.max_read_chunk, readsize * 2)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000333 except EOFError:
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000334 if size > self.extrasize:
335 size = self.extrasize
Tim Peters07e99cb2001-01-14 23:47:14 +0000336
Antoine Pitroub1f88352010-01-03 22:37:40 +0000337 offset = self.offset - self.extrastart
338 chunk = self.extrabuf[offset: offset + size]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000339 self.extrasize = self.extrasize - size
Guido van Rossum15262191997-04-30 16:04:57 +0000340
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000341 self.offset += size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000342 return chunk
Guido van Rossum15262191997-04-30 16:04:57 +0000343
Antoine Pitrou4ec4b0c2011-04-04 21:00:37 +0200344 def read1(self, size=-1):
345 self._check_closed()
346 if self.mode != READ:
347 import errno
348 raise IOError(errno.EBADF, "read1() on write-only GzipFile object")
349
350 if self.extrasize <= 0 and self.fileobj is None:
351 return b''
352
353 try:
354 self._read()
355 except EOFError:
356 pass
357 if size < 0 or size > self.extrasize:
358 size = self.extrasize
359
360 offset = self.offset - self.extrastart
361 chunk = self.extrabuf[offset: offset + size]
362 self.extrasize -= size
363 self.offset += size
364 return chunk
365
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000366 def peek(self, n):
367 if self.mode != READ:
368 import errno
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000369 raise IOError(errno.EBADF, "peek() on write-only GzipFile object")
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000370
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000371 # Do not return ridiculously small buffers, for one common idiom
372 # is to call peek(1) and expect more bytes in return.
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000373 if n < 100:
374 n = 100
375 if self.extrasize == 0:
376 if self.fileobj is None:
377 return b''
378 try:
Antoine Pitrou7b998e92010-10-04 21:55:14 +0000379 # 1024 is the same buffering heuristic used in read()
380 self._read(max(n, 1024))
Antoine Pitrouc3ed2e72010-09-29 10:49:46 +0000381 except EOFError:
382 pass
383 offset = self.offset - self.extrastart
384 remaining = self.extrasize
385 assert remaining == len(self.extrabuf) - offset
386 return self.extrabuf[offset:offset + n]
387
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000388 def _unread(self, buf):
Guido van Rossum84c6fc91998-08-03 15:41:39 +0000389 self.extrasize = len(buf) + self.extrasize
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000390 self.offset -= len(buf)
Guido van Rossumb16a3b81998-01-27 19:29:45 +0000391
392 def _read(self, size=1024):
Tim Petersfb0ea522002-11-04 19:50:11 +0000393 if self.fileobj is None:
Collin Winterce36ad82007-08-30 01:19:48 +0000394 raise EOFError("Reached EOF")
Tim Peters07e99cb2001-01-14 23:47:14 +0000395
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000396 if self._new_member:
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000397 # If the _new_member flag is set, we have to
398 # jump to the next member, if there is one.
Tim Peters07e99cb2001-01-14 23:47:14 +0000399 self._init_read()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000400 self._read_gzip_header()
401 self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000402 self._new_member = False
Tim Peters07e99cb2001-01-14 23:47:14 +0000403
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000404 # Read a chunk of data from the file
405 buf = self.fileobj.read(size)
Tim Peters07e99cb2001-01-14 23:47:14 +0000406
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000407 # If the EOF has been reached, flush the decompression object
408 # and mark this object as finished.
Tim Peters07e99cb2001-01-14 23:47:14 +0000409
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000410 if buf == b"":
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000411 uncompress = self.decompress.flush()
Antoine Pitrou7b969842010-09-23 16:22:51 +0000412 # Prepend the already read bytes to the fileobj to they can be
413 # seen by _read_eof()
414 self.fileobj.prepend(self.decompress.unused_data, True)
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000415 self._read_eof()
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000416 self._add_read_data( uncompress )
Collin Winterce36ad82007-08-30 01:19:48 +0000417 raise EOFError('Reached EOF')
Tim Peters07e99cb2001-01-14 23:47:14 +0000418
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000419 uncompress = self.decompress.decompress(buf)
420 self._add_read_data( uncompress )
421
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000422 if self.decompress.unused_data != b"":
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000423 # Ending case: we've come to the end of a member in the file,
424 # so seek back to the start of the unused data, finish up
425 # this member, and read a new gzip header.
Antoine Pitrou7b969842010-09-23 16:22:51 +0000426 # Prepend the already read bytes to the fileobj to they can be
427 # seen by _read_eof() and _read_gzip_header()
428 self.fileobj.prepend(self.decompress.unused_data, True)
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000429 # Check the CRC and file size, and set the flag so we read
Tim Peters07e99cb2001-01-14 23:47:14 +0000430 # a new member on the next call
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000431 self._read_eof()
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000432 self._new_member = True
Tim Peters07e99cb2001-01-14 23:47:14 +0000433
434 def _add_read_data(self, data):
Christian Heimesfe337bf2008-03-23 21:54:12 +0000435 self.crc = zlib.crc32(data, self.crc) & 0xffffffff
Antoine Pitroub1f88352010-01-03 22:37:40 +0000436 offset = self.offset - self.extrastart
437 self.extrabuf = self.extrabuf[offset:] + data
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000438 self.extrasize = self.extrasize + len(data)
Antoine Pitroub1f88352010-01-03 22:37:40 +0000439 self.extrastart = self.offset
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000440 self.size = self.size + len(data)
Guido van Rossum15262191997-04-30 16:04:57 +0000441
442 def _read_eof(self):
Antoine Pitrou7b969842010-09-23 16:22:51 +0000443 # We've read to the end of the file
Andrew M. Kuchlingf4f119c1999-03-25 21:49:14 +0000444 # We check the that the computed CRC and size of the
Tim Peters9288f952002-11-05 20:38:55 +0000445 # uncompressed data matches the stored values. Note that the size
446 # stored is the true file size mod 2**32.
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000447 crc32 = read32(self.fileobj)
Christian Heimesfe337bf2008-03-23 21:54:12 +0000448 isize = read32(self.fileobj) # may exceed 2GB
449 if crc32 != self.crc:
450 raise IOError("CRC check failed %s != %s" % (hex(crc32),
451 hex(self.crc)))
Christian Heimes1dc54002008-03-24 02:19:29 +0000452 elif isize != (self.size & 0xffffffff):
Collin Winterce36ad82007-08-30 01:19:48 +0000453 raise IOError("Incorrect length of data produced")
Tim Peters07e99cb2001-01-14 23:47:14 +0000454
Antoine Pitrou8e33fd72010-01-13 14:37:26 +0000455 # Gzip files can be padded with zeroes and still have archives.
456 # Consume all zero bytes and set the file position to the first
457 # non-zero byte. See http://www.gzip.org/#faq8
458 c = b"\x00"
459 while c == b"\x00":
460 c = self.fileobj.read(1)
461 if c:
Antoine Pitrou7b969842010-09-23 16:22:51 +0000462 self.fileobj.prepend(c, True)
Antoine Pitrou8e33fd72010-01-13 14:37:26 +0000463
Antoine Pitroub1f88352010-01-03 22:37:40 +0000464 @property
465 def closed(self):
466 return self.fileobj is None
467
Guido van Rossum15262191997-04-30 16:04:57 +0000468 def close(self):
Georg Brandlb533e262008-05-25 18:19:30 +0000469 if self.fileobj is None:
470 return
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000471 if self.mode == WRITE:
472 self.fileobj.write(self.compress.flush())
Christian Heimesfe337bf2008-03-23 21:54:12 +0000473 write32u(self.fileobj, self.crc)
Tim Peters9288f952002-11-05 20:38:55 +0000474 # self.size may exceed 2GB, or even 4GB
Christian Heimes1dc54002008-03-24 02:19:29 +0000475 write32u(self.fileobj, self.size & 0xffffffff)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000476 self.fileobj = None
477 elif self.mode == READ:
478 self.fileobj = None
479 if self.myfileobj:
480 self.myfileobj.close()
481 self.myfileobj = None
Guido van Rossum15262191997-04-30 16:04:57 +0000482
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000483 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
Antoine Pitrou7980eaa2010-10-06 21:21:18 +0000484 self._check_closed()
Martin v. Löwisf2a8d632005-03-03 08:35:22 +0000485 if self.mode == WRITE:
Tim Peterseba28be2005-03-28 01:08:02 +0000486 # Ensure the compressor's buffer is flushed
487 self.fileobj.write(self.compress.flush(zlib_mode))
Mark Dickinsona9eb87a2010-05-04 18:47:04 +0000488 self.fileobj.flush()
Guido van Rossum15262191997-04-30 16:04:57 +0000489
Tim Peters5cfb05e2004-07-27 21:02:02 +0000490 def fileno(self):
491 """Invoke the underlying file object's fileno() method.
492
493 This will raise AttributeError if the underlying file object
494 doesn't support fileno().
495 """
496 return self.fileobj.fileno()
497
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000498 def rewind(self):
499 '''Return the uncompressed stream file position indicator to the
Tim Petersab9ba272001-08-09 21:40:30 +0000500 beginning of the file'''
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000501 if self.mode != READ:
502 raise IOError("Can't rewind in write mode")
503 self.fileobj.seek(0)
Guido van Rossum8ca162f2002-04-07 06:36:23 +0000504 self._new_member = True
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000505 self.extrabuf = b""
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000506 self.extrasize = 0
Antoine Pitroub1f88352010-01-03 22:37:40 +0000507 self.extrastart = 0
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000508 self.offset = 0
509
Antoine Pitroub1f88352010-01-03 22:37:40 +0000510 def readable(self):
511 return self.mode == READ
512
513 def writable(self):
514 return self.mode == WRITE
515
516 def seekable(self):
517 return True
518
Thomas Wouters89f507f2006-12-13 04:49:30 +0000519 def seek(self, offset, whence=0):
520 if whence:
521 if whence == 1:
522 offset = self.offset + offset
523 else:
524 raise ValueError('Seek from end not supported')
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000525 if self.mode == WRITE:
526 if offset < self.offset:
527 raise IOError('Negative seek in write mode')
528 count = offset - self.offset
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000529 chunk = bytes(1024)
Tim Petersfb0ea522002-11-04 19:50:11 +0000530 for i in range(count // 1024):
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000531 self.write(chunk)
532 self.write(bytes(count % 1024))
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000533 elif self.mode == READ:
534 if offset < self.offset:
535 # for negative seek, rewind and do positive seek
536 self.rewind()
537 count = offset - self.offset
Tim Petersfb0ea522002-11-04 19:50:11 +0000538 for i in range(count // 1024):
539 self.read(1024)
Martin v. Löwis8cc965c2001-08-09 07:21:56 +0000540 self.read(count % 1024)
541
Antoine Pitroub1f88352010-01-03 22:37:40 +0000542 return self.offset
543
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000544 def readline(self, size=-1):
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545 if size < 0:
Antoine Pitroub1f88352010-01-03 22:37:40 +0000546 # Shortcut common case - newline found in buffer.
547 offset = self.offset - self.extrastart
548 i = self.extrabuf.find(b'\n', offset) + 1
549 if i > 0:
550 self.extrasize -= i - offset
551 self.offset += i - offset
552 return self.extrabuf[offset: i]
553
Christian Heimesa37d4c62007-12-04 23:02:19 +0000554 size = sys.maxsize
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555 readsize = self.min_readsize
556 else:
557 readsize = size
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000558 bufs = []
Thomas Wouters477c8d52006-05-27 19:21:47 +0000559 while size != 0:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000560 c = self.read(readsize)
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000561 i = c.find(b'\n')
Thomas Wouters477c8d52006-05-27 19:21:47 +0000562
563 # We set i=size to break out of the loop under two
564 # conditions: 1) there's no newline, and the chunk is
565 # larger than size, or 2) there is a newline, but the
566 # resulting line would be longer than 'size'.
567 if (size <= i) or (i == -1 and len(c) > size):
568 i = size - 1
Guido van Rossum15262191997-04-30 16:04:57 +0000569
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000570 if i >= 0 or c == b'':
Thomas Wouters477c8d52006-05-27 19:21:47 +0000571 bufs.append(c[:i + 1]) # Add portion of last chunk
572 self._unread(c[i + 1:]) # Push back rest of chunk
573 break
Andrew M. Kuchling41616ee2000-07-29 20:15:26 +0000574
575 # Append chunk to list, decrease 'size',
576 bufs.append(c)
577 size = size - len(c)
578 readsize = min(size, readsize * 2)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000579 if readsize > self.min_readsize:
580 self.min_readsize = min(readsize, self.min_readsize * 2, 512)
Walter Dörwald5b1284d2007-06-06 16:43:59 +0000581 return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000582
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000583
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.
    Optional argument is the compression level, in range of 1-9.
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    try:
        gz.write(data)
    finally:
        # Closing writes the trailer into the in-memory buffer.
        gz.close()
    return sink.getvalue()
592
def decompress(data):
    """Decompress gzip compressed bytes in one shot.
    Return the decompressed bytes.
    """
    reader = GzipFile(fileobj=io.BytesIO(data))
    try:
        return reader.read()
    finally:
        reader.close()
599
600
def _test():
    """Command line driver: compress, or with -d decompress, files.

    Each argument names a file; "-" (the default when no file is given)
    stands for standard input and output.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompressing = bool(args) and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The standard streams are used through their .buffer objects, so
        # those are what must be compared here; comparing with sys.stdout
        # and sys.stdin never matched and closed the process's streams.
        if g is not sys.stdout.buffer:
            g.close()
        else:
            g.flush()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000638
# Run the command line driver only when executed as a script.
if __name__ == '__main__':
    _test()