blob: 2480414650ada7490bb189798802ebd736415d83 [file] [log] [blame]
Tor Norbye3a2425a2013-11-04 10:16:08 -08001"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time
9import zlib
10import __builtin__
11
12__all__ = ["GzipFile","open"]
13
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
17
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is treated as the signed view of a 32-bit pattern and is
    shifted up by 2**32; non-negative values are returned unchanged.
    """
    if i < 0:
        # Plain int literal: the result is promoted automatically when it
        # does not fit in a machine int, so no explicit long is needed.
        i += 1 << 32
    return i
26
def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int."""
    # Masking a negative value also yields a non-negative result.
    return i & 0xFFFFFFFF
30
def write32(output, value):
    """Write value to output as a signed 32-bit little-endian integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
33
def write32u(output, value):
    """Write value to output as an unsigned 32-bit little-endian integer."""
    # "<L" is the unsigned counterpart of the "<l" format used by write32.
    packed = struct.pack("<L", value)
    output.write(packed)
38
def read32(input):
    """Read four bytes from input and return them as a signed 32-bit
    little-endian integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
41
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file by name and return the resulting GzipFile.

    This is shorthand for GzipFile(filename, mode, compresslevel).
    filename must be supplied; mode falls back to 'rb' and
    compresslevel falls back to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
50
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by this instance itself (rather than handed in by
    # the caller); close() is responsible for closing it.
    myfileobj = None
    # XXX: repeated 10mb chunk reads hurt test_gzip.test_many_append's
    # performance on Jython (maybe CPython's allocator recycles the same
    # 10mb buffer whereas Java's doesn't)
    #max_read_chunk = 10 * 1024 * 1024   # 10Mb
    max_read_chunk = 256 * 1024   # 256kb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data that has not been handed to the caller yet,
            # and its length.
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] in ('w', 'a'):
            self.mode = WRITE
            self._init_write(filename)
            # Negative window bits: the gzip header and trailer are
            # written by this class, not by zlib.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell().
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return the repr of the underlying file object, tagged as gzip."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state and record the '.gz' file name."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the header of a new gzip member to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # _init_write() guarantees the name ends in '.gz'; the header
        # stores it without that suffix.
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file.

        Raises IOError if the magic bytes or the compression method are
        not the expected ones.  Optional fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Raises IOError (EBADF) on a read-mode object and ValueError on a
        closed one.  Empty data is ignored.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of decompressed data.

        A negative size reads everything up to end of file.  Fewer than
        size bytes are returned only when the end of the data is reached.
        Raises IOError (EBADF) on a write-mode object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        # The chunk size doubles on every pass, capped at max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the pending decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress one chunk of at most size compressed bytes.

        The result is appended to the pending buffer.  Raises EOFError
        once no more data is available.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the pending buffer and update the
        running CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the trailer of a member.

        Raises IOError if either does not match the data decompressed.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file that was opened by this object
        is closed; a caller-supplied fileobj is left open.  Calling
        close() again on a closed object does nothing.
        """
        fileobj = self.fileobj
        # Detach first so that a repeated close() (or __del__) does not
        # try to write the trailer a second time.
        self.fileobj = None
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                write32u(fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, LOWU32(self.size))
        finally:
            # Close the file we opened ourselves even if writing the
            # trailer failed, so the handle is not leaked.
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        """Close the object on garbage collection if it is still open."""
        try:
            if (self.myfileobj is None and
                    self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set the attributes.
            return
        self.close()

    if not sys.platform.startswith('java'):
        def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
            """Flush pending compressed data (write mode) and then the
            underlying file object."""
            if self.mode == WRITE:
                # Ensure the compressor's buffer is flushed
                self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()
    else:
        # Java lacks Z_SYNC_FLUSH; thus Jython can't flush the
        # compressobj until EOF
        def flush(self, zlib_mode=None):
            """Flush the underlying file object only; zlib_mode is ignored."""
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Return False; a GzipFile is never a terminal."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        """Move to absolute position offset in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled
        with NUL bytes and a backward seek raises IOError.  In read mode
        a backward seek rewinds to the start, and data is then read and
        discarded until offset is reached.
        """
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for _ in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for _ in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, including its trailing newline if present.

        A non-negative size limits the number of bytes returned.  An
        empty string is returned at end of file.
        """
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting chunk size for later calls, capped
        # at 512 bytes.
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)  # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once about sizehint bytes
        have been collected or the end of file is reached."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every string in the iterable L, in order."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields successive lines."""
        return self

    def next(self):
        """Return the next line, or raise StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
459
460
def _test():
    """Act like gzip; with -d as first argument, act like gunzip.

    Each remaining command-line argument names a file to process; "-"
    (also the default when no file is given) means stdin to stdout.
    The input file is not deleted, nor are any other gzip options or
    features supported.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write(
                        "filename doesn't end in .gz: %r\n" % (arg,))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Release both ends even if the copy fails part-way; the standard
        # streams themselves are never closed.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()

if __name__ == '__main__':
    _test()