blob: 2f53aa8aacfeb61d4c477a2472b2eea62b8dc2b1 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against / written into the flag byte of a gzip member
# header (see _read_gzip_header() and _write_gzip_header()).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    mode may be "r", "rb", "w", "wb", "a" or "ab" for binary access, or
    "rt", "wt" or "at" for text access.  It defaults to "rb";
    compresslevel defaults to 9.

    Binary mode: the result is GzipFile(filename, mode, compresslevel).
    Passing encoding, errors or newline raises ValueError.

    Text mode: a GzipFile is created (with the "t" stripped from mode) and
    wrapped in an io.TextIOWrapper configured with the given encoding,
    error handling behavior, and line ending(s).

    Raises ValueError when mode contains both "t" and "b".

    """
    text_mode = "t" in mode

    if text_mode and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

    if not text_mode:
        # Text-only arguments are rejected in signature order, so the first
        # offending one is the one reported.
        text_only_args = (("encoding", encoding),
                          ("errors", errors),
                          ("newline", newline))
        for arg_name, arg_value in text_only_args:
            if arg_value is not None:
                raise ValueError(
                    "Argument '%s' not supported in binary mode" % arg_name)

    gz_file = GzipFile(filename, mode.replace("t", ""), compresslevel)
    if not text_mode:
        return gz_file
    return io.TextIOWrapper(gz_file, encoding, errors, newline)
51
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output only needs a write() method accepting bytes.  Nothing is
    returned.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
56
def read32(input):
    """Read 4 bytes from input and return them as a little-endian unsigned int."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000059
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object f, serving the bytes in prepend before f's data."""
        # Bytes that are served before anything is read from self.file.
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Read position inside self._buffer, or None once the buffer has
        # been exhausted (or dropped) and reads go straight to self.file.
        self._read = 0

    def read(self, size):
        """Return up to size bytes, draining the prepended buffer first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Hand out what is left of the buffer and top it up from the
            # underlying file; from now on the buffer is considered spent.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Make the bytes in prepend available to subsequent read() calls.

        When readprevious is true and the buffer position is at least
        len(prepend), the position is simply moved back by len(prepend)
        instead of storing the bytes again (the caller asserts they are the
        bytes most recently read).
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the still-unread tail of the current buffer.  (This used
            # to slice with an undefined name and raised NameError.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the buffered bytes not yet consumed, without consuming them."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek the wrapped file, discarding the prepended buffer if needed.

        A relative seek (whence == 1) that stays inside the buffer only
        moves the buffer position and returns None; every other seek drops
        the buffer and returns the result of the underlying file's seek().
        """
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Anything not defined here is delegated to the wrapped file.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000114
115
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises ValueError for a mode containing 't' or 'U', or one that does
        not start with 'r', 'w' or 'a'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name') and isinstance(fileobj.name, str):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        # Position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of name; in write mode '.gz' is appended if missing."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real file rather than the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the state used while compressing (name, CRC, size, buffers)."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # The two bytes that _read_gzip_header() skips as "extraflag" and
        # "os".
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume and validate the header of the next gzip member.

        Stores the member's timestamp in self.mtime.  Raises EOFError when
        no bytes are left, and IOError when the magic number or the
        compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress is only (re)created by _read() after
        # this method returns, so this uses the previous member's object.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it out; return the number of bytes consumed.

        Raises ValueError on a closed file and IOError(EBADF) when the
        object was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes (everything if size < 0).

        Raises ValueError on a closed file and IOError(EBADF) when the
        object was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size bytes after at most one attempt to read more data."""
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At least 100 bytes are requested internally, so more than n bytes
        may be returned.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back: only the counters move, extrabuf still holds the bytes."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and append their decompressed form to
        the internal buffer.  Raises EOFError when the input is exhausted."""
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Fold data into the running CRC/size and append it to extrabuf."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        # Drop the part of the buffer that has already been handed out.
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer (CRC32 and size) and skip zero padding.

        Raises IOError when either stored value does not match.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the compressor is flushed and the CRC32/size trailer
        is written.  A file opened by the constructor is closed; a fileobj
        supplied by the caller is left open.  Calling close() again is a
        no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Detach first so the object is marked closed even if writing the
        # trailer fails below.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release the file we opened ourselves; previously an
            # exception above leaked it.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the compressor (write mode only) using zlib_mode, then flush
        the underlying file object."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative); anything else raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; in read mode a backward seek rewinds
        and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most size bytes long when size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000605
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000606
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the resulting bytes.

    compresslevel is passed on to GzipFile; it ranges from 1 to 9.
    """
    sink = io.BytesIO()
    gz_writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with gz_writer:
        gz_writer.write(data)
    # The trailer is only written on close, so read the result afterwards.
    return sink.getvalue()
615
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the uncompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz_reader:
        payload = gz_reader.read()
    return payload
622
623
def _test():
    """Command-line driver: compress, or with a leading -d decompress.

    Each remaining argument is a file name; "-" (also the default when no
    name is given) means standard input to standard output.  Compressing
    FILE writes FILE.gz; decompressing FILE.gz writes FILE, and names not
    ending in ".gz" are reported and skipped.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # f and g are the binary buffers of the standard streams, not the
        # text wrappers, so compare against .buffer to leave them open.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000661
662if __name__ == '__main__':
663 _test()