blob: 698f0c2747ebdc69542febf00ab37051bc7e84bb [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks for the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the stream was opened.
READ = 1
WRITE = 2
18
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str or bytes) or an already-open file object
    (anything with a read or write attribute) to read from or write to.

    mode is "r", "rb", "w", "wb", "a" or "ab" for binary mode, or "rt", "wt"
    or "at" for text mode.  The default mode is "rb" and the default
    compresslevel is 9.

    In binary mode the result is GzipFile(filename, mode, compresslevel), and
    passing encoding, errors or newline raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper created with
    the given encoding, error handling behavior, and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when filename is neither a str/bytes object nor file-like.

    """
    text_mode = "t" in mode
    if text_mode and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not text_mode:
        # The text-layer options are meaningless without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes, so drop the "t".
    raw_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes)):
        gz_file = GzipFile(filename, raw_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        gz_file = GzipFile(None, raw_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return gz_file
    return io.TextIOWrapper(gz_file, encoding, errors, newline)
62
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
67
def read32(input):
    """Read 4 bytes from input and return them as a little-endian unsigned int."""
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000070
Antoine Pitrou7b969842010-09-23 16:22:51 +000071class _PaddedFile:
72 """Minimal read-only file object that prepends a string to the contents
73 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
74 essential functionality."""
75
76 def __init__(self, f, prepend=b''):
77 self._buffer = prepend
78 self._length = len(prepend)
79 self.file = f
80 self._read = 0
81
82 def read(self, size):
83 if self._read is None:
84 return self.file.read(size)
85 if self._read + size <= self._length:
86 read = self._read
87 self._read += size
88 return self._buffer[read:self._read]
89 else:
90 read = self._read
91 self._read = None
92 return self._buffer[read:] + \
93 self.file.read(size-self._length+read)
94
95 def prepend(self, prepend=b'', readprevious=False):
96 if self._read is None:
97 self._buffer = prepend
98 elif readprevious and len(prepend) <= self._read:
99 self._read -= len(prepend)
100 return
101 else:
102 self._buffer = self._buffer[read:] + prepend
103 self._length = len(self._buffer)
104 self._read = 0
105
106 def unused(self):
107 if self._read is None:
108 return b''
109 return self._buffer[self._read:]
110
111 def seek(self, offset, whence=0):
112 # This is only ever called with offset=whence=0
113 if whence == 1 and self._read is not None:
114 if 0 <= offset + self._read <= self._length:
115 self._read += offset
116 return
117 else:
118 offset += self._length - self._read
119 self._read = None
120 self._buffer = None
121 return self.file.seek(offset, whence)
122
123 def __getattr__(self, name):
Georg Brandl9f1c1dc2010-11-20 11:25:01 +0000124 return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +0000125
126
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # File object opened by the constructor itself (None when the caller
    # supplied fileobj); close() closes it.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.  Any other mode raises ValueError.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        # Reject an explicitly requested unsupported mode before touching the
        # filesystem, so that no file is opened (or created) and then leaked.
        if mode and not mode.startswith(('r', 'w', 'a')):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Only reachable with a mode that was not validated above (an
            # empty string, or one taken from fileobj.mode).  Do not leak a
            # file that this constructor opened itself.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        # Position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute (".gz" is appended in
        write mode when missing)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the real underlying file, not the read-mode wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state (name, CRC, size, buffers) for writing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Two fixed bytes follow the timestamp; the reader treats the same
        # two positions as the extra-flags and OS bytes and skips them.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Parse and skip the header of the next gzip member.

        Raises EOFError if no more data is available and OSError if the data
        does not start with the gzip magic number or uses an unknown
        compression method.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise OSError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise OSError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # Bytes still sitting in the _PaddedFile buffer are compressed data
        # belonging to this member; feed them to the decompressor now.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it out; return the number of bytes consumed.

        Raises ValueError if the file is closed and OSError(EBADF) if it was
        opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes (everything if size < 0).

        Raises ValueError if the file is closed and OSError(EBADF) if it was
        opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size bytes, reading more input only while the
        internal buffer is empty.

        Raises ValueError if the file is closed and OSError(EBADF) if it was
        opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            # For certain input data, a single call to _read() may not return
            # any data. In this case, retry until we get some data or reach EOF.
            while self.extrasize <= 0:
                self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        At most max(n, 100) bytes are returned.  Raises OSError(EBADF) if the
        file was opened for writing.
        """
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # Ensure that we don't return b"" if we haven't reached EOF.
                while self.extrasize == 0:
                    # 1024 is the same buffering heuristic used in read()
                    self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push buf back so that it is returned again by the next read."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer the decompressed data.

        Raises EOFError once the underlying file is exhausted (or closed).
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update CRC/size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer; raise OSError on CRC or size mismatch."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  A file opened by the constructor is closed even
        if writing the trailer fails; a caller-supplied fileobj is left open.
        Calling close() more than once is allowed.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed up front so a failure below cannot leave the object
        # half-closed.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to an uncompressed offset and return the new position.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; a backward seek raises OSError.  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, at most size bytes long if size >= 0."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000622
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000623
def compress(data, compresslevel=9):
    """Gzip-compress *data* in a single call and return the result.

    The output is collected in an in-memory io.BytesIO buffer and
    returned as bytes.  compresslevel is passed through to GzipFile
    (range 0-9, default 9).
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    try:
        writer.write(data)
    finally:
        # Closing must happen before getvalue() so the stream is complete.
        writer.close()
    return sink.getvalue()
632
def decompress(data):
    """Decompress gzip-compressed *data* in a single call.

    The input is wrapped in an in-memory io.BytesIO buffer and read
    through GzipFile; the full decompressed content is returned.
    """
    reader = GzipFile(fileobj=io.BytesIO(data))
    try:
        return reader.read()
    finally:
        reader.close()
639
640
def _test():
    """Minimal command-line driver: act like gzip; with -d, like gunzip.

    Arguments are taken from sys.argv[1:].  A leading "-d" selects
    decompression.  Each remaining argument is a file name, or "-" for
    stdin/stdout (also the default when no file is given).  The input
    file is not deleted, and no other gzip options or features are
    supported.

    Files opened here are closed even when opening the second file or
    the copy itself fails; the standard streams are never closed.
    """
    args = sys.argv[1:]
    # Named so it does not shadow the module-level decompress() function.
    do_decompress = bool(args) and args[0] == "-d"
    if do_decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if do_decompress and arg != "-" and not arg.endswith(".gz"):
            print("filename doesn't end in .gz:", repr(arg))
            continue
        src = dst = None
        try:
            if do_decompress:
                if arg == "-":
                    src = GzipFile(filename="", mode="rb",
                                   fileobj=sys.stdin.buffer)
                    dst = sys.stdout.buffer
                else:
                    src = open(arg, "rb")
                    dst = builtins.open(arg[:-3], "wb")
            else:
                if arg == "-":
                    src = sys.stdin.buffer
                    dst = GzipFile(filename="", mode="wb",
                                   fileobj=sys.stdout.buffer)
                else:
                    src = builtins.open(arg, "rb")
                    dst = open(arg + ".gz", "wb")
            # Copy in 1024-byte chunks until the input is exhausted.
            while True:
                chunk = src.read(1024)
                if not chunk:
                    break
                dst.write(chunk)
        finally:
            # Close output first, then input, skipping the std streams.
            try:
                if dst is not None and dst is not sys.stdout.buffer:
                    dst.close()
            finally:
                if src is not None and src is not sys.stdin.buffer:
                    src.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000678
# Run the gzip/gunzip-like command-line driver only when this file is
# executed as a script, not when it is imported.
if __name__ == '__main__':
    _test()