blob: 58e866b132d1cabdb28995bd8e44c1a514d222e2 [file] [log] [blame]
Guido van Rossum4b8c6ea2000-02-04 15:39:30 +00001"""Functions that read and write gzipped files.
2
Guido van Rossum54f22ed2000-02-04 15:10:34 +00003The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
Lars Gustäbel1440df22009-10-29 09:39:47 +00008import struct, sys, time, os
Guido van Rossum15262191997-04-30 16:04:57 +00009import zlib
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins
Antoine Pitroub1f88352010-01-03 22:37:40 +000011import io
Guido van Rossum15262191997-04-30 16:04:57 +000012
Antoine Pitrou79c5ef12010-08-17 21:10:05 +000013__all__ = ["GzipFile", "open", "compress", "decompress"]
Skip Montanaro2dd42762001-01-23 15:35:05 +000014
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
18
def U32(i):
    """Return i reinterpreted as an unsigned 32-bit quantity.

    Negative inputs are shifted up by 2**32; non-negative inputs are
    returned unchanged.
    """
    return i + (1 << 32) if i < 0 else i
26
def LOWU32(i):
    """Return the 32 least-significant bits of i as a non-negative int."""
    low_mask = 0xFFFFFFFF
    return i & low_mask
Tim Peters9288f952002-11-05 20:38:55 +000030
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    output only needs a write() method accepting bytes.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
35
def read32(input):
    """Read 4 bytes from input and decode them as a little-endian
    unsigned 32-bit integer."""
    (value,) = struct.unpack("<I", input.read(4))
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000038
def open(filename, mode="rb", compresslevel=9):
    """Create and return a GzipFile for filename.

    This is a convenience wrapper around the GzipFile constructor.
    filename must be supplied; mode falls back to 'rb' and
    compresslevel to 9 when they are omitted.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
47
Antoine Pitrou7b969842010-09-23 16:22:51 +000048class _PaddedFile:
49 """Minimal read-only file object that prepends a string to the contents
50 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
51 essential functionality."""
52
53 def __init__(self, f, prepend=b''):
54 self._buffer = prepend
55 self._length = len(prepend)
56 self.file = f
57 self._read = 0
58
59 def read(self, size):
60 if self._read is None:
61 return self.file.read(size)
62 if self._read + size <= self._length:
63 read = self._read
64 self._read += size
65 return self._buffer[read:self._read]
66 else:
67 read = self._read
68 self._read = None
69 return self._buffer[read:] + \
70 self.file.read(size-self._length+read)
71
72 def prepend(self, prepend=b'', readprevious=False):
73 if self._read is None:
74 self._buffer = prepend
75 elif readprevious and len(prepend) <= self._read:
76 self._read -= len(prepend)
77 return
78 else:
79 self._buffer = self._buffer[read:] + prepend
80 self._length = len(self._buffer)
81 self._read = 0
82
83 def unused(self):
84 if self._read is None:
85 return b''
86 return self._buffer[self._read:]
87
88 def seek(self, offset, whence=0):
89 # This is only ever called with offset=whence=0
90 if whence == 1 and self._read is not None:
91 if 0 <= offset + self._read <= self._length:
92 self._read += offset
93 return
94 else:
95 offset += self._length - self._read
96 self._read = None
97 self._buffer = None
98 return self.file.seek(offset, whence)
99
100 def __getattr__(self, name):
101 return getattr(name, self.file)
102
103
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (when only a filename
    # was given).  close() closes this object and nothing else.
    myfileobj = None
    # Upper bound for the size of a single read from the compressed stream.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            # Wrap the source so bytes read past the end of a member can
            # be pushed back (see _read() and _read_eof()).
            fileobj = _PaddedFile(fileobj)

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream.
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for the name attribute.

        In write mode '.gz' is appended unless the name already ends
        with it.  Accessing the property emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        """Return '<gzip ... 0x...>' built from the repr of the underlying
        file object (unwrapped from _PaddedFile in read mode)."""
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte and OS byte (the two bytes _read_gzip_header()
        # skips); per RFC 1952, 2 means "maximum compression" and 255
        # means "unknown" operating system.
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Parse and skip a gzip member header.

        Stores the header timestamp in self.mtime.  Raises EOFError when
        there is no more input and IOError when the magic number or the
        compression method is not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic == b'':
            raise EOFError("Reached EOF")

        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

        # NOTE(review): self.decompress at this point is whatever object
        # the previous member left behind -- _read() installs the new
        # decompressor only after this method returns.  Kept as in the
        # original; confirm the intent before changing the order.
        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Returns len(data).  Raises IOError(EBADF) on a read-only object
        and ValueError on a closed one.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes, or everything that is
        left when size is negative.

        Raises IOError(EBADF) on a write-only object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def peek(self, n):
        """Return uncompressed bytes from the current position without
        advancing it.

        Requests below 100 bytes are treated as 100.  Raises
        IOError(EBADF) on a write-only object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        # Do not return ridiculously small buffers
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            try:
                self._read(max(self.max_read_chunk, n))
            except EOFError:
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        """Push len(buf) bytes back by moving the stream position
        backwards; the bytes themselves are still held in extrabuf."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and append the decompressed
        result to the internal buffer.

        Raises EOFError when the object is closed or the underlying file
        is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer, dropping the part of
        the buffer that has already been consumed, and update the running
        CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the trailer of the member that just ended.

        Raises IOError if the stored CRC or length does not match the
        values computed while decompressing.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the trailer (CRC
        and size) are written first.  Only a file object opened by the
        constructor itself is closed.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed up front so that a failing write below
        # cannot leave it half-open.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            # Always release the file we opened ourselves, even when
            # writing the trailer raised.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the underlying file object; in write mode the
        compressor's pending output is written first, using zlib_mode."""
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        """Return True if the object was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the object was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        """Always True; see seek() for how seeking is emulated."""
        return True

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream and
        return the new position.

        whence may be 0 (absolute) or 1 (relative); seeking from the end
        raises ValueError.  In write mode only forward seeks are possible
        and the gap is filled with zero bytes (IOError otherwise).  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including the trailing newline if
        one was found; at most size bytes when size is non-negative."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line
Tim Peters07e99cb2001-01-14 23:47:14 +0000555
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000556
def compress(data, compresslevel=9):
    """Gzip-compress data in a single call and return the result as bytes.

    compresslevel is passed through to GzipFile and selects the
    compression level (1-9).
    """
    sink = io.BytesIO()
    gz = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    with gz:
        gz.write(data)
    # The trailer is only written on close, so read the sink afterwards.
    return sink.getvalue()
565
def decompress(data):
    """Decompress gzip-compressed data in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz:
        payload = gz.read()
    return payload
572
573
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (also the default when no
    file is given) means stdin to stdout.  The input file is not deleted,
    however, nor are any other gzip options or features supported.
    """
    args = sys.argv[1:]
    decompressing = args and args[0] == "-d"
    if decompressing:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompressing:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The streams in use are the binary buffers, so those are what
        # must be compared against; comparing with sys.stdout / sys.stdin
        # never matched and closed the standard streams after the first
        # argument.  GzipFile wrappers are still closed so that the
        # trailer gets written.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()

if __name__ == '__main__':
    _test()