blob: 85c3e150d96def0f89cd7e0d345d14fe4682e1b9 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the object was opened.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer."""
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from input and return them decoded as a
    little-endian unsigned 32-bit integer."""
    (value,) = struct.unpack("<I", input.read(4))
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000026
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    Only filename is mandatory; when omitted, mode is 'rb' and
    compresslevel is 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object f, serving the bytes in prepend before f's data."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Read position inside _buffer.  None means the buffer has been
        # used up and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return size bytes, taken from the buffer first and then from the
        wrapped file."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely by the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain what is left of the buffer and fetch the remainder of
            # the request from the wrapped file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Push the bytes in prepend back so that they are read next.

        When readprevious is true and at least len(prepend) bytes of the
        current buffer have already been consumed, the read position is
        simply moved back by len(prepend) and the argument's contents are
        not stored.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the bytes that have not been consumed yet and append the
            # new ones.  (The original code sliced with an undefined local
            # name here, which raised NameError.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the buffered bytes that have not been read yet, without
        consuming them."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek within the buffer when possible, else in the wrapped file.

        Any seek that reaches the wrapped file discards the buffer.
        """
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # NOTE(review): the sign of this adjustment looks suspicious
                # for a relative seek past the buffer; this path is not
                # exercised by the callers in this file -- confirm before
                # relying on it.
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Delegate everything not defined here to the wrapped file object.
        return getattr(self.file, name)
Antoine Pitrou7b969842010-09-23 16:22:51 +000090
91
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, wrap your GzipFile with an io.TextIOWrapper.

    """

    # File object opened by the constructor itself (when no fileobj was
    # passed in); close() closes it.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name') and isinstance(fileobj.name, str):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode ".gz" is appended when the name does not already end
        with it.  Accessing this property emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            # Show the wrapped file rather than the internal wrapper.
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the state used while writing: name, CRC and size counters."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            # A bytes name is written as given; only str names need (and
            # support) encoding.
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Two fixed bytes follow the timestamp; _read_gzip_header() skips
        # them as the "extraflag" and "os" bytes.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the CRC and size counters before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header from the underlying file.

        Raises EOFError when no bytes are left and IOError when the magic
        number or the compression method is not the expected one.  The
        stored timestamp is saved in self.mtime; optional header fields are
        skipped.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress here is whichever decompressor was
        # assigned before this header was read (_read() creates the new one
        # only after this method returns) -- confirm this is intended.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Returns the number of uncompressed bytes accepted.  Raises
        ValueError on a closed file and IOError (EBADF) when the object
        was not opened for writing.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; everything when size < 0.

        Raises ValueError on a closed file and IOError (EBADF) when the
        object was not opened for reading.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        """Return up to size uncompressed bytes after a single _read() call.

        With size < 0 everything currently buffered is returned.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        try:
            self._read()
        except EOFError:
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        n is raised to at least 100; the result may be shorter than that
        when less data is available.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                # 1024 is the same buffering heuristic used in read()
                self._read(max(n, 1024))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Move the stream position back by len(buf) bytes.

        Only the counters are adjusted; the bytes themselves are expected
        to still be in extrabuf.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and append their decompressed form to
        the internal buffer.

        Raises EOFError once the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that lies before the current position.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the member trailer (CRC and size) and skip zero padding.

        Raises IOError when either stored value does not match the data
        that was decompressed.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The file opened by the constructor (if
        any) is closed even when writing the trailer fails; a file object
        supplied by the caller is left open.  Calling close() again is a
        no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Drop the reference first so the object counts as closed even if
        # one of the writes below raises.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with zlib_mode and then flush
        the underlying file object.  Does nothing in read mode."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream.

        whence may be 0 (absolute) or 1 (relative); seeking from the end
        raises ValueError.  In write mode only forward seeks are possible
        and the gap is filled with zero bytes.  In read mode a backward
        seek rewinds and re-reads from the start.  Returns the new
        position.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Return one line (including its newline), limited to size bytes
        when size is not negative."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000581
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000582
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the result.

    compresslevel is optional and is handed to GzipFile unchanged
    (range 1-9).
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with writer:
        writer.write(data)
    return sink.getvalue()
591
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
598
599
def _test():
    """Command-line helper: compress the files named in sys.argv.

    With "-d" as the first argument the files are decompressed instead.
    The argument "-" (also the default when no file is given) means
    standard input to standard output.
    """
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # f and g may be the binary buffers of the standard streams; those
        # must stay open, so compare against the .buffer objects that were
        # actually assigned above.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000637
# Run the command-line helper when this module is executed as a script.
if __name__ == '__main__':
    _test()