blob: 3edc8395d68a9ef51838345a44ea4690f37943f6 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i has 2**32 added to it; a non-negative i is returned
    unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Return the low-order 32 bits of i, as a non-negative int."""
    low_32_bits = (1 << 32) - 1
    return i & low_32_bits
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as four bytes in little-endian order."""
    # "<L" is the struct format for a little-endian unsigned 32-bit integer.
    packed = struct.pack("<L", value)
    output.write(packed)
35
def read32(input):
    """Read four bytes from input and return them as an unsigned int.

    The bytes are interpreted in little-endian order.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return the GzipFile object.

    This is a convenience wrapper equivalent to
    GzipFile(filename, mode, compresslevel).  Only the filename argument
    is required; mode defaults to 'rb' and compresslevel defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
47
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        """Wrap file object f, serving the bytes in prepend before its data."""
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Index of the next unread byte in _buffer, or None once the buffer
        # has been exhausted and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return size bytes, taken from the buffer first, then the file."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Hand out what is left of the buffer and read the remainder
            # of the request from the wrapped file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b'', readprevious=False):
        """Make bytes available again to subsequent read() calls.

        If readprevious is true and prepend is no longer than the part of
        the buffer already consumed, the read position is simply moved
        back by len(prepend); the content of prepend is not inspected in
        that case.  Otherwise the buffer is rebuilt and the read position
        is reset to its start.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Keep the still-unread tail of the buffer.  (This used to
            # reference an undefined local name and raised NameError.)
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the unread part of the buffer without consuming it."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek the wrapped file; the buffer is dropped unless a relative
        seek stays inside it."""
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        """Delegate any attribute not defined here to the wrapped file."""
        return getattr(self.file, name)
102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by this instance itself (and therefore closed by
    # close()); stays None when the caller supplied fileobj.
    myfileobj = None
    # Upper bound on the size of a single read from the compressed file.
    max_read_chunk = 10 * 1024 * 1024   # 10 MiB

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file we opened ourselves before rejecting the
            # mode.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode '.gz' is appended when the name does not already end
        with it.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the name, running CRC and size before writing a member."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        name = self.name
        if isinstance(name, (str, bytes)):
            try:
                # RFC 1952 requires the FNAME field to be Latin-1. Do not
                # include filenames that cannot be represented that way.
                fname = os.path.basename(name)
                if not isinstance(fname, bytes):
                    fname = fname.encode('latin-1')
                if fname.endswith(b'.gz'):
                    fname = fname[:-3]
            except UnicodeEncodeError:
                fname = b''
        else:
            # e.g. an integer file descriptor taken from fileobj.name;
            # there is no usable file name to record.
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # The two bytes that _read_gzip_header() skips as "extraflag"
        # and "os".
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header, discarding optional fields.

        Raises EOFError if no bytes are left, and IOError if the magic
        number or the compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress is still the previous member's
        # object here, because _read() creates the new one only after this
        # method returns.  unused() does not consume the bytes, so they are
        # returned again by the next fileobj.read() -- confirm intent.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data, write it out and return len(data).

        Raises IOError if the object was not opened for writing and
        ValueError if it has been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size, or None, reads until the end of the stream.
        Raises IOError if the object was not opened for reading.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size is None or size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Move the stream position back by len(buf) bytes.

        Only the counters are adjusted; extrabuf itself is left untouched.
        """
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read size compressed bytes and add their decompressed form to
        the read buffer.

        Raises EOFError when the object is closed or the underlying file
        has no more data.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer and update the
        running CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        # Drop the part of the buffer that has already been consumed.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in a member trailer.

        Raises IOError if either value does not match what was computed
        while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file object supplied by the caller
        is not closed; a file opened by this object is closed even if
        writing the trailer fails.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so that a failure below cannot
        # leave it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream and return it.

        whence may be 0 (absolute) or 1 (relative to the current
        position); any other value raises ValueError.  In write mode only
        forward seeks are possible and the gap is filled with zero bytes;
        a backward seek raises IOError.  In read mode a backward seek
        rewinds and reads forward again.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline.

        If size is non-negative, at most size bytes are returned; a
        negative size, or None, means no limit.
        """
        if size is None or size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000532
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000533
def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed bytes.

    The optional compresslevel argument is the compression level, in the
    range 1-9.
    """
    sink = io.BytesIO()
    with GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel) as gz:
        gz.write(data)
    return sink.getvalue()
542
def decompress(data):
    """Decompress gzip-compressed data in one shot.

    Return the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
549
550
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (the default when no file
    is given) means standard input to standard output.  The input file is
    not deleted, nor are any other gzip options or features supported.
    """
    args = sys.argv[1:]
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The streams used above are the binary buffers, so compare against
        # those; comparing with sys.stdout / sys.stdin was always true and
        # closed the standard streams after the first "-" argument.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000588
# Run the command-line driver when this module is executed as a script.
if __name__ == "__main__":
    _test()