Blame - python/lib/Lib/gzip.py - platform/tools/idea

blob: 2480414650ada7490bb189798802ebd736415d83 [file] [log] [blame]

Tor Norbye	3a2425a	2013-11-04 10:16:08 -0800	[diff] [blame^]	1	"""Functions that read and write gzipped files.
				2
				3	The user of the file doesn't have to worry about the compression,
				4	but random access is not allowed."""
				5
				6	# based on Andrew Kuchling's minigzip.py distributed with the zlib module
				7
				8	import struct, sys, time
				9	import zlib
				10	import __builtin__
				11
				12	__all__ = ["GzipFile","open"]
				13
				14	FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
				15
				16	READ, WRITE = 1, 2
				17
				18	def U32(i):
				19	"""Return i as an unsigned integer, assuming it fits in 32 bits.
				20
				21	If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
				22	"""
				23	if i < 0:
				24	i += 1L << 32
				25	return i
				26
				27	def LOWU32(i):
				28	"""Return the low-order 32 bits of an int, as a non-negative int."""
				29	return i & 0xFFFFFFFFL
				30
				31	def write32(output, value):
				32	output.write(struct.pack("<l", value))
				33
				34	def write32u(output, value):
				35	# The L format writes the bit pattern correctly whether signed
				36	# or unsigned.
				37	output.write(struct.pack("<L", value))
				38
				39	def read32(input):
				40	return struct.unpack("<l", input.read(4))[0]
				41
				42	def open(filename, mode="rb", compresslevel=9):
				43	"""Shorthand for GzipFile(filename, mode, compresslevel).
				44
				45	The filename argument is required; mode defaults to 'rb'
				46	and compresslevel defaults to 9.
				47
				48	"""
				49	return GzipFile(filename, mode, compresslevel)
				50
				51	class GzipFile:
				52	"""The GzipFile class simulates most of the methods of a file object with
				53	the exception of the readinto() and truncate() methods.
				54
				55	"""
				56
				57	myfileobj = None
				58	# XXX: repeated 10mb chunk reads hurt test_gzip.test_many_append's
				59	# performance on Jython (maybe CPython's allocator recycles the same
				60	# 10mb buffer whereas Java's doesn't)
				61	#max_read_chunk = 10 * 1024 * 1024 # 10Mb
				62	max_read_chunk = 256 * 1024 # 256kb
				63
				64	def __init__(self, filename=None, mode=None,
				65	compresslevel=9, fileobj=None):
				66	"""Constructor for the GzipFile class.
				67
				68	At least one of fileobj and filename must be given a
				69	non-trivial value.
				70
				71	The new class instance is based on fileobj, which can be a regular
				72	file, a StringIO object, or any other object which simulates a file.
				73	It defaults to None, in which case filename is opened to provide
				74	a file object.
				75
				76	When fileobj is not None, the filename argument is only used to be
				77	included in the gzip file header, which may includes the original
				78	filename of the uncompressed file. It defaults to the filename of
				79	fileobj, if discernible; otherwise, it defaults to the empty string,
				80	and in this case the original filename is not included in the header.
				81
				82	The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
				83	depending on whether the file will be read or written. The default
				84	is the mode of fileobj if discernible; otherwise, the default is 'rb'.
				85	Be aware that only the 'rb', 'ab', and 'wb' values should be used
				86	for cross-platform portability.
				87
				88	The compresslevel argument is an integer from 1 to 9 controlling the
				89	level of compression; 1 is fastest and produces the least compression,
				90	and 9 is slowest and produces the most compression. The default is 9.
				91
				92	"""
				93
				94	# guarantee the file is opened in binary mode on platforms
				95	# that care about that sort of thing
				96	if mode and 'b' not in mode:
				97	mode += 'b'
				98	if fileobj is None:
				99	fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
				100	if filename is None:
				101	if hasattr(fileobj, 'name'): filename = fileobj.name
				102	else: filename = ''
				103	if mode is None:
				104	if hasattr(fileobj, 'mode'): mode = fileobj.mode
				105	else: mode = 'rb'
				106
				107	if mode[0:1] == 'r':
				108	self.mode = READ
				109	# Set flag indicating start of a new member
				110	self._new_member = True
				111	self.extrabuf = ""
				112	self.extrasize = 0
				113	self.filename = filename
				114	# Starts small, scales exponentially
				115	self.min_readsize = 100
				116
				117	elif mode[0:1] == 'w' or mode[0:1] == 'a':
				118	self.mode = WRITE
				119	self._init_write(filename)
				120	self.compress = zlib.compressobj(compresslevel,
				121	zlib.DEFLATED,
				122	-zlib.MAX_WBITS,
				123	zlib.DEF_MEM_LEVEL,
				124	0)
				125	else:
				126	raise IOError, "Mode " + mode + " not supported"
				127
				128	self.fileobj = fileobj
				129	self.offset = 0
				130
				131	if self.mode == WRITE:
				132	self._write_gzip_header()
				133
				134	def __repr__(self):
				135	s = repr(self.fileobj)
				136	return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
				137
				138	def _init_write(self, filename):
				139	if filename[-3:] != '.gz':
				140	filename = filename + '.gz'
				141	self.filename = filename
				142	self.crc = zlib.crc32("")
				143	self.size = 0
				144	self.writebuf = []
				145	self.bufsize = 0
				146
				147	def _write_gzip_header(self):
				148	self.fileobj.write('\037\213') # magic header
				149	self.fileobj.write('\010') # compression method
				150	fname = self.filename[:-3]
				151	flags = 0
				152	if fname:
				153	flags = FNAME
				154	self.fileobj.write(chr(flags))
				155	write32u(self.fileobj, long(time.time()))
				156	self.fileobj.write('\002')
				157	self.fileobj.write('\377')
				158	if fname:
				159	self.fileobj.write(fname + '\000')
				160
				161	def _init_read(self):
				162	self.crc = zlib.crc32("")
				163	self.size = 0
				164
				165	def _read_gzip_header(self):
				166	magic = self.fileobj.read(2)
				167	if magic != '\037\213':
				168	raise IOError, 'Not a gzipped file'
				169	method = ord( self.fileobj.read(1) )
				170	if method != 8:
				171	raise IOError, 'Unknown compression method'
				172	flag = ord( self.fileobj.read(1) )
				173	# modtime = self.fileobj.read(4)
				174	# extraflag = self.fileobj.read(1)
				175	# os = self.fileobj.read(1)
				176	self.fileobj.read(6)
				177
				178	if flag & FEXTRA:
				179	# Read & discard the extra field, if present
				180	xlen = ord(self.fileobj.read(1))
				181	xlen = xlen + 256*ord(self.fileobj.read(1))
				182	self.fileobj.read(xlen)
				183	if flag & FNAME:
				184	# Read and discard a null-terminated string containing the filename
				185	while True:
				186	s = self.fileobj.read(1)
				187	if not s or s=='\000':
				188	break
				189	if flag & FCOMMENT:
				190	# Read and discard a null-terminated string containing a comment
				191	while True:
				192	s = self.fileobj.read(1)
				193	if not s or s=='\000':
				194	break
				195	if flag & FHCRC:
				196	self.fileobj.read(2) # Read & discard the 16-bit header CRC
				197
				198
				199	def write(self,data):
				200	if self.mode != WRITE:
				201	import errno
				202	raise IOError(errno.EBADF, "write() on read-only GzipFile object")
				203
				204	if self.fileobj is None:
				205	raise ValueError, "write() on closed GzipFile object"
				206	if len(data) > 0:
				207	self.size = self.size + len(data)
				208	self.crc = zlib.crc32(data, self.crc)
				209	self.fileobj.write( self.compress.compress(data) )
				210	self.offset += len(data)
				211
				212	def read(self, size=-1):
				213	if self.mode != READ:
				214	import errno
				215	raise IOError(errno.EBADF, "read() on write-only GzipFile object")
				216
				217	if self.extrasize <= 0 and self.fileobj is None:
				218	return ''
				219
				220	readsize = 1024
				221	if size < 0: # get the whole thing
				222	try:
				223	while True:
				224	self._read(readsize)
				225	readsize = min(self.max_read_chunk, readsize * 2)
				226	except EOFError:
				227	size = self.extrasize
				228	else: # just get some more of it
				229	try:
				230	while size > self.extrasize:
				231	self._read(readsize)
				232	readsize = min(self.max_read_chunk, readsize * 2)
				233	except EOFError:
				234	if size > self.extrasize:
				235	size = self.extrasize
				236
				237	chunk = self.extrabuf[:size]
				238	self.extrabuf = self.extrabuf[size:]
				239	self.extrasize = self.extrasize - size
				240
				241	self.offset += size
				242	return chunk
				243
				244	def _unread(self, buf):
				245	self.extrabuf = buf + self.extrabuf
				246	self.extrasize = len(buf) + self.extrasize
				247	self.offset -= len(buf)
				248
				249	def _read(self, size=1024):
				250	if self.fileobj is None:
				251	raise EOFError, "Reached EOF"
				252
				253	if self._new_member:
				254	# If the _new_member flag is set, we have to
				255	# jump to the next member, if there is one.
				256	#
				257	# First, check if we're at the end of the file;
				258	# if so, it's time to stop; no more members to read.
				259	pos = self.fileobj.tell() # Save current position
				260	self.fileobj.seek(0, 2) # Seek to end of file
				261	if pos == self.fileobj.tell():
				262	raise EOFError, "Reached EOF"
				263	else:
				264	self.fileobj.seek( pos ) # Return to original position
				265
				266	self._init_read()
				267	self._read_gzip_header()
				268	self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
				269	self._new_member = False
				270
				271	# Read a chunk of data from the file
				272	buf = self.fileobj.read(size)
				273
				274	# If the EOF has been reached, flush the decompression object
				275	# and mark this object as finished.
				276
				277	if buf == "":
				278	uncompress = self.decompress.flush()
				279	self._read_eof()
				280	self._add_read_data( uncompress )
				281	raise EOFError, 'Reached EOF'
				282
				283	uncompress = self.decompress.decompress(buf)
				284	self._add_read_data( uncompress )
				285
				286	if self.decompress.unused_data != "":
				287	# Ending case: we've come to the end of a member in the file,
				288	# so seek back to the start of the unused data, finish up
				289	# this member, and read a new gzip header.
				290	# (The number of bytes to seek back is the length of the unused
				291	# data, minus 8 because _read_eof() will rewind a further 8 bytes)
				292	self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
				293
				294	# Check the CRC and file size, and set the flag so we read
				295	# a new member on the next call
				296	self._read_eof()
				297	self._new_member = True
				298
				299	def _add_read_data(self, data):
				300	self.crc = zlib.crc32(data, self.crc)
				301	self.extrabuf = self.extrabuf + data
				302	self.extrasize = self.extrasize + len(data)
				303	self.size = self.size + len(data)
				304
				305	def _read_eof(self):
				306	# We've read to the end of the file, so we have to rewind in order
				307	# to reread the 8 bytes containing the CRC and the file size.
				308	# We check the that the computed CRC and size of the
				309	# uncompressed data matches the stored values. Note that the size
				310	# stored is the true file size mod 2**32.
				311	self.fileobj.seek(-8, 1)
				312	crc32 = read32(self.fileobj)
				313	isize = U32(read32(self.fileobj)) # may exceed 2GB
				314	if U32(crc32) != U32(self.crc):
				315	raise IOError, "CRC check failed"
				316	elif isize != LOWU32(self.size):
				317	raise IOError, "Incorrect length of data produced"
				318
				319	def close(self):
				320	if self.mode == WRITE:
				321	self.fileobj.write(self.compress.flush())
				322	# The native zlib crc is an unsigned 32-bit integer, but
				323	# the Python wrapper implicitly casts that to a signed C
				324	# long. So, on a 32-bit box self.crc may "look negative",
				325	# while the same crc on a 64-bit box may "look positive".
				326	# To avoid irksome warnings from the `struct` module, force
				327	# it to look positive on all boxes.
				328	write32u(self.fileobj, LOWU32(self.crc))
				329	# self.size may exceed 2GB, or even 4GB
				330	write32u(self.fileobj, LOWU32(self.size))
				331	self.fileobj = None
				332	elif self.mode == READ:
				333	self.fileobj = None
				334	if self.myfileobj:
				335	self.myfileobj.close()
				336	self.myfileobj = None
				337
				338	def __del__(self):
				339	try:
				340	if (self.myfileobj is None and
				341	self.fileobj is None):
				342	return
				343	except AttributeError:
				344	return
				345	self.close()
				346
				347	if not sys.platform.startswith('java'):
				348	def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
				349	if self.mode == WRITE:
				350	# Ensure the compressor's buffer is flushed
				351	self.fileobj.write(self.compress.flush(zlib_mode))
				352	self.fileobj.flush()
				353	else:
				354	# Java lacks Z_SYNC_FLUSH; thus Jython can't flush the
				355	# compressobj until EOF
				356	def flush(self,zlib_mode=None):
				357	self.fileobj.flush()
				358
				359	def fileno(self):
				360	"""Invoke the underlying file object's fileno() method.
				361
				362	This will raise AttributeError if the underlying file object
				363	doesn't support fileno().
				364	"""
				365	return self.fileobj.fileno()
				366
				367	def isatty(self):
				368	return False
				369
				370	def tell(self):
				371	return self.offset
				372
				373	def rewind(self):
				374	'''Return the uncompressed stream file position indicator to the
				375	beginning of the file'''
				376	if self.mode != READ:
				377	raise IOError("Can't rewind in write mode")
				378	self.fileobj.seek(0)
				379	self._new_member = True
				380	self.extrabuf = ""
				381	self.extrasize = 0
				382	self.offset = 0
				383
				384	def seek(self, offset):
				385	if self.mode == WRITE:
				386	if offset < self.offset:
				387	raise IOError('Negative seek in write mode')
				388	count = offset - self.offset
				389	for i in range(count // 1024):
				390	self.write(1024 * '\0')
				391	self.write((count % 1024) * '\0')
				392	elif self.mode == READ:
				393	if offset < self.offset:
				394	# for negative seek, rewind and do positive seek
				395	self.rewind()
				396	count = offset - self.offset
				397	for i in range(count // 1024):
				398	self.read(1024)
				399	self.read(count % 1024)
				400
				401	def readline(self, size=-1):
				402	if size < 0:
				403	size = sys.maxint
				404	readsize = self.min_readsize
				405	else:
				406	readsize = size
				407	bufs = []
				408	while size != 0:
				409	c = self.read(readsize)
				410	i = c.find('\n')
				411
				412	# We set i=size to break out of the loop under two
				413	# conditions: 1) there's no newline, and the chunk is
				414	# larger than size, or 2) there is a newline, but the
				415	# resulting line would be longer than 'size'.
				416	if (size <= i) or (i == -1 and len(c) > size):
				417	i = size - 1
				418
				419	if i >= 0 or c == '':
				420	bufs.append(c[:i + 1]) # Add portion of last chunk
				421	self._unread(c[i + 1:]) # Push back rest of chunk
				422	break
				423
				424	# Append chunk to list, decrease 'size',
				425	bufs.append(c)
				426	size = size - len(c)
				427	readsize = min(size, readsize * 2)
				428	if readsize > self.min_readsize:
				429	self.min_readsize = min(readsize, self.min_readsize * 2, 512)
				430	return ''.join(bufs) # Return resulting line
				431
				432	def readlines(self, sizehint=0):
				433	# Negative numbers result in reading all the lines
				434	if sizehint <= 0:
				435	sizehint = sys.maxint
				436	L = []
				437	while sizehint > 0:
				438	line = self.readline()
				439	if line == "":
				440	break
				441	L.append(line)
				442	sizehint = sizehint - len(line)
				443
				444	return L
				445
				446	def writelines(self, L):
				447	for line in L:
				448	self.write(line)
				449
				450	def __iter__(self):
				451	return self
				452
				453	def next(self):
				454	line = self.readline()
				455	if line:
				456	return line
				457	else:
				458	raise StopIteration
				459
				460
				461	def _test():
				462	# Act like gzip; with -d, act like gunzip.
				463	# The input file is not deleted, however, nor are any other gzip
				464	# options or features supported.
				465	args = sys.argv[1:]
				466	decompress = args and args[0] == "-d"
				467	if decompress:
				468	args = args[1:]
				469	if not args:
				470	args = ["-"]
				471	for arg in args:
				472	if decompress:
				473	if arg == "-":
				474	f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
				475	g = sys.stdout
				476	else:
				477	if arg[-3:] != ".gz":
				478	print "filename doesn't end in .gz:", repr(arg)
				479	continue
				480	f = open(arg, "rb")
				481	g = __builtin__.open(arg[:-3], "wb")
				482	else:
				483	if arg == "-":
				484	f = sys.stdin
				485	g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
				486	else:
				487	f = __builtin__.open(arg, "rb")
				488	g = open(arg + ".gz", "wb")
				489	while True:
				490	chunk = f.read(1024)
				491	if not chunk:
				492	break
				493	g.write(chunk)
				494	if g is not sys.stdout:
				495	g.close()
				496	if f is not sys.stdin:
				497	f.close()
				498
				499	if __name__ == '__main__':
				500	_test()