blob: c0179a4716e12850d32c2c4752f583b510502b52 [file] [log] [blame]
Guido van Rossum15262191997-04-30 16:04:57 +00001import time
2import string
3import zlib
Jeremy Hyltonc19f9971999-03-23 23:05:34 +00004import struct
Guido van Rossum68de3791997-07-19 20:22:23 +00005import __builtin__
Guido van Rossum15262191997-04-30 16:04:57 +00006
# Implements a file-like class (and an open() helper) that reads and writes
# gzip-compressed files.  The user of the file doesn't have to worry about
# the compression, but random access is not allowed.

# Based on Andrew Kuchling's minigzip.py distributed with the zlib module.
12
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
16
def write32(output, value):
    """Write value to output as a 4-byte little-endian signed integer.

    output only needs a write() method; value must fit the struct
    "<l" format, otherwise struct.pack raises.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
Guido van Rossum15262191997-04-30 16:04:57 +000019
def read32(input):
    """Read 4 bytes from input and return them as a little-endian signed int.

    input only needs a read() method.  struct.unpack raises if fewer
    than 4 bytes are available.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
Guido van Rossum15262191997-04-30 16:04:57 +000022
def open(filename, mode="r", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    Returns the new GzipFile object; the underlying file is opened (and
    later closed) by that object.
    """
    gz = GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
    return gz
25
class GzipFile:
    """File-like object that transparently reads or writes gzip data.

    The object wraps an underlying file object.  In read mode the gzip
    member headers are parsed, the deflate stream is decompressed and the
    CRC/size trailer of every member is verified.  In write mode a gzip
    header is emitted on construction, data passed to write() is
    compressed, and close() appends the CRC/size trailer.

    Random access is not supported: seek() and tell() raise IOError.
    """

    # File object opened by this instance itself (and therefore closed by
    # close()).  Stays None when the caller supplied fileobj.
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Set up the object for reading or writing.

        filename      -- name of the file to open when fileobj is None;
                         otherwise only used for the name stored in the
                         gzip header.  Defaults to fileobj.name, or ''.
        mode          -- a mode string starting with 'r', 'w' or 'a'.
                         Defaults to fileobj.mode, or 'r'.
        compresslevel -- level handed to zlib.compressobj (write mode).
        fileobj       -- an already open file object to wrap, or None to
                         open filename.

        Raises ValueError if mode does not start with 'r', 'w' or 'a'.
        """
        if fileobj is None:
            # gzip data is binary, so never open the underlying file in
            # text mode.  Only the mode passed to the builtin open() is
            # adjusted; `mode` itself is left as the caller gave it.
            open_mode = mode or 'r'
            if 'b' not in open_mode:
                open_mode = open_mode + 'b'
            fileobj = self.myfileobj = __builtin__.open(filename, open_mode)
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'r'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = 1
            # Decompressed data not yet handed out by read().
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Mode " + mode + " not supported")

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built from the repr of the wrapped file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state; ensure the stored name ends in .gz."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # Name stored in the header is the filename without '.gz'.
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32(self.fileobj, int(time.time()))    # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size for a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file.

        Raises IOError if the magic bytes or the compression method are
        not the expected ones.  Optional header fields are skipped.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extra flags (1 byte) and OS (1 byte).
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Raises ValueError if the object has been closed.
        """
        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))

    def read(self, size=None):
        """Return up to size bytes of decompressed data.

        With size None (the default) or negative, everything up to the end
        of the file is returned.  Returns '' at end of file and for
        size == 0.
        """
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size is None or size < 0:    # get the whole thing
            try:
                while 1:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                size = self.extrasize
        else:                           # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the pending-data buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize

    def _read(self, size=1024):
        """Read and decompress one chunk of size compressed bytes.

        The decompressed data is appended to the pending-data buffer.
        Raises EOFError once there is nothing left to read; by then
        self.fileobj has been set to None.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to start on a new
            # member.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                self.fileobj = None
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = 0

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self.fileobj = None
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = 1

    def _add_read_data(self, data):
        """Account for freshly decompressed data and queue it for read()."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC/size trailer of the member just finished.

        Raises ValueError if either stored value differs from the one
        computed while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
        if crc32 != self.crc:
            raise ValueError("CRC check failed")
        elif isize != self.size:
            raise ValueError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The wrapped file is closed only if
        this object opened it.  Calling close() again is harmless.
        """
        # Only write the trailer while the stream is still open, so a
        # second close() does not fail on fileobj being None.
        if self.mode == WRITE and self.fileobj is not None:
            self.fileobj.write(self.compress.flush())
            write32(self.fileobj, self.crc)
            write32(self.fileobj, self.size)
        self.fileobj = None
        # In read mode fileobj may already be None (set at EOF) while the
        # file we opened is still open, so this must always run.
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def seek(self, *args):
        """Always raise IOError; accepts and ignores file-style arguments."""
        raise IOError('Random access not allowed in gzip files')

    def tell(self):
        """Always raise IOError."""
        raise IOError('I won\'t tell() you for gzip files')

    def isatty(self):
        """Return 0: a GzipFile is never a terminal."""
        return 0

    def readline(self):
        """Return the next line including its newline, or '' at EOF."""
        bufs = []
        readsize = 100
        while 1:
            c = self.read(readsize)
            i = string.find(c, '\n')
            if i >= 0 or c == '':
                # Keep everything up to the newline and push the rest
                # back.  At EOF i is -1, so both slices are empty.
                bufs.append(c[:i+1])
                self._unread(c[i+1:])
                return string.join(bufs, '')
            bufs.append(c)
            readsize = readsize * 2

    def readlines(self):
        """Return all remaining lines as a list, newlines included."""
        buf = self.read()
        lines = string.split(buf, '\n')
        for i in range(len(lines)-1):
            lines[i] = lines[i] + '\n'
        if lines and not lines[-1]:
            del lines[-1]
        return lines

    def writelines(self, L):
        """Write every string in the sequence L, adding no separators."""
        for line in L:
            self.write(line)
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000288
289
def _test():
    """Minimal command-line driver: gzip by default, gunzip with -d.

    Every remaining argument is a file name; "-" (also the default when no
    name is given) means stdin to stdout.  The input file is not deleted,
    and no other gzip options or features are supported.
    """
    import sys
    args = sys.argv[1:]
    decompress = len(args) > 0 and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        use_stdio = (arg == "-")
        if decompress and use_stdio:
            f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
            g = sys.stdout
        elif decompress:
            if arg[-3:] != ".gz":
                print("%s %s" % ("filename doesn't end in .gz:", repr(arg)))
                continue
            f = open(arg, "rb")
            g = __builtin__.open(arg[:-3], "wb")
        elif use_stdio:
            f = sys.stdin
            g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            f = __builtin__.open(arg, "rb")
            g = open(arg + ".gz", "wb")
        # Copy in 1024-byte chunks until the source is exhausted.
        chunk = f.read(1024)
        while chunk:
            g.write(chunk)
            chunk = f.read(1024)
        # The process's own standard streams are left open.
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000328
# Run the command-line driver when the module is executed as a script.
if "__main__" == __name__:
    _test()