blob: 45152e440d52d176c9902af71e81ce777a1ae12c [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Antoine Pitrou2dbc6e62015-04-11 00:31:01 +020012import _compression
Guido van Rossum15262191997-04-30 16:04:57 +000013
# Public names exported by "from gzip import *".
__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit masks for the flag byte of a gzip member header.  The reader tests
# the header's flag byte against these to decide which optional fields
# (extra field, file name, comment, header CRC) must be skipped.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was opened
# for reading or for writing.
READ, WRITE = 1, 2
19
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a real file name (str or bytes) or an already
    open file object that will be read from or written to.

    *mode* is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary access, or "rt", "wt", "xt" or "at" for text access.  It
    defaults to "rb"; *compresslevel* defaults to 9.

    In binary mode the call is equivalent to
    GzipFile(filename, mode, compresslevel), and *encoding*, *errors* and
    *newline* must all be left as None (ValueError otherwise).

    In text mode a GzipFile is created and wrapped in an io.TextIOWrapper
    configured with the given encoding, error handling and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", and
    TypeError when *filename* is neither a str/bytes object nor an object
    with a read() or write() attribute.
    """
    text_mode = "t" in mode

    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # The text-layer options only make sense with a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")

    if isinstance(filename, (str, bytes)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return binary_file
    return io.TextIOWrapper(binary_file, encoding, errors, newline)
63
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* only needs a write() method accepting bytes.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
68
class _PaddedFile:
    """Minimal read-only file wrapper that serves a pushed-back byte string
    before the contents of the real file.

    Intended for use inside gzip.py only; it deliberately implements just
    the handful of methods the gzip reader needs.
    """

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Offset of the next unread byte in self._buffer; None once the
        # buffer is exhausted and reads go straight to the real file.
        self._read = 0

    def read(self, size):
        """Return *size* bytes, draining the pushed-back buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)

        stop = start + size
        if stop <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]

        # The buffer cannot cover the request: hand out what is left of it
        # and take the remainder from the real file.
        self._read = None
        pending = self._buffer[start:]
        return pending + self.file.read(size - self._length + start)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call, so
            # un-reading is just a matter of moving the offset back.
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Discard any pushed-back data and seek the real file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
Antoine Pitrou7b969842010-09-23 16:22:51 +0000109
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode requests text or universal-newline
        handling, or does not start with 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We own this file object, so close() must close it as well.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: emit a raw deflate stream; the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute (".gz" appended when
        writing and the name lacks that suffix)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        # self._buffer is only created for objects opened for reading (see
        # __init__); a write-mode object has read no header, so report None
        # instead of failing with AttributeError.
        buffer = getattr(self, '_buffer', None)
        if buffer is None:
            return None
        return buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags byte
        self.fileobj.write(b'\377')                 # operating system byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it to the underlying file object.

        *data* may be bytes or any object supporting the buffer protocol.
        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) if the object was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += length

        return length

    def read(self, size=-1):
        """Read decompressed data through the internal buffered reader.

        Raises OSError (EBADF) if the object was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek().

        Raises OSError (EBADF) if the object was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        When writing, the remaining compressed data, the CRC and the size
        are written out.  The underlying file object is closed only if this
        object opened it itself.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark as closed first so a failure below cannot lead to the
        # trailer being written twice.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """When writing, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        When writing, only forward seeks are possible; the gap is filled
        by compressing zero bytes.  When reading, the request is delegated
        to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        self._check_not_closed()
        return self._buffer.readline(size)
373
374
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes gzip members from a file object, checking
    each member's stored CRC and size against the decompressed data."""

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        self._last_mtime = None
        # Set flag indicating start of a new member
        self._new_member = True

    def _init_read(self):
        """Reset the running CRC and size before decoding a member."""
        self._stream_size = 0  # Decompressed size of unconcatenated stream
        self._crc = zlib.crc32(b"") & 0xffffffff

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.  Raises EOFError if the file ends first.
        '''
        collected = self._fp.read(n)
        missing = n - len(collected)
        while missing > 0:
            piece = self._fp.read(missing)
            if not piece:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            collected += piece
            missing = n - len(collected)
        return collected

    def _read_gzip_header(self):
        """Consume one member header.

        Returns False if the file is exhausted, True once a header has
        been read.  Raises OSError for a bad magic number or an unknown
        compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file (%r)' % magic)

        method, flag, mtime = struct.unpack("<BBIxx", self._read_exact(8))
        self._last_mtime = mtime
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)

        # The filename and the comment are both stored as null-terminated
        # strings; read and discard whichever of them is present.
        for field_bit in (FNAME, FCOMMENT):
            if flag & field_bit:
                byte = self._fp.read(1)
                while byte and byte != b'\000':
                    byte = self._fp.read(1)

        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes (everything if negative).

        An empty bytes object is returned once no further member follows.
        Raises EOFError if the file ends in the middle of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single call to decompress() may not
        # return any data. In this case, retry until we get some data or
        # reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the
                # file, so finish up this member, and read a new gzip
                # header.  Check the CRC and file size, and set the flag
                # so we read a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            raw_chunk = self._fp.read(io.DEFAULT_BUFFER_SIZE)
            decomp = self._decompressor
            payload = decomp.decompress(raw_chunk, size)

            # Prepend the already read bytes to the fileobj so they can
            # be seen again: by the next decompress() call, or by
            # _read_eof() and _read_gzip_header().
            leftover = decomp.unconsumed_tail
            if leftover == b"":
                leftover = decomp.unused_data
            if leftover != b"":
                self._fp.prepend(leftover)

            if payload != b"":
                self._add_read_data(payload)
                self._pos += len(payload)
                return payload
            if raw_chunk == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and decompressed size."""
        self._crc = zlib.crc32(data, self._crc) & 0xffffffff
        self._stream_size += len(data)

    def _read_eof(self):
        """Validate the trailer of the member that has just ended."""
        # We've read to the end of the file.
        # We check that the computed CRC and size of the uncompressed
        # data match the stored values.  Note that the size stored is
        # the true file size mod 2**32.
        stored_crc, stored_size = struct.unpack("<II", self._read_exact(8))
        if stored_crc != self._crc:
            raise OSError("CRC check failed %s != %s" % (hex(stored_crc),
                                                         hex(self._crc)))
        if stored_size != (self._stream_size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        byte = self._fp.read(1)
        while byte == b"\x00":
            byte = self._fp.read(1)
        if byte:
            self._fp.prepend(byte)

    def _rewind(self):
        super()._rewind()
        self._new_member = True
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000515
def compress(data, compresslevel=9):
    """Compress *data* in one shot and return the gzip-compressed bytes.

    The optional *compresslevel* argument is the compression level, in the
    range 0-9.
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with gz:
        gz.write(data)
    # The trailer is only written when the GzipFile is closed, so the
    # buffer must be read after leaving the with block.
    return sink.getvalue()
524
def decompress(data):
    """Decompress gzip-compressed *data* in one shot.

    Return the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
531
532
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (the default when no file
    is given) means standard input to standard output.  The input file is
    not deleted, however, nor are any other gzip options or features
    supported.
    """
    # Imported locally: only this command-line helper needs it.
    from contextlib import ExitStack

    args = sys.argv[1:]
    # Named so as not to shadow the module-level decompress() function.
    decompressing = bool(args) and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        # The stack closes every stream this iteration opened, output
        # before input, even if opening the second stream or the copy
        # fails part-way.  The standard streams are never registered, so
        # they stay open.
        with ExitStack() as stack:
            if decompressing:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    g = stack.enter_context(open(arg + ".gz", "wb"))
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)


if __name__ == '__main__':
    _test()