blob: 3d656b927e8a855321bf82b508f359382b510ef6 [file] [log] [blame]
import __builtin__
import string
import StringIO
import struct
import time
import zlib
Guido van Rossum15262191997-04-30 16:04:57 +00006
# Implements a Python function that reads and writes a gzipped file.
# The user of the file doesn't have to worry about the compression,
# but random access is not allowed.

# Based on Andrew Kuchling's minigzip.py distributed with the zlib module.
12
# Bit masks for the FLG byte of a gzip member header; _read_gzip_header
# tests FEXTRA, FNAME, FCOMMENT and FHCRC to decide which optional header
# fields must be skipped.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
16
def write32(output, value):
    """Write the low 32 bits of *value* to *output*, least significant byte first.

    output -- object with a write() method accepting a 4-byte string
    value  -- integer; only its low 32 bits are written, so negative
              values (e.g. a signed CRC) come out in two's-complement form
    """
    # Masking keeps struct from rejecting negative or oversized values and
    # yields exactly the four low-order bytes.
    output.write(struct.pack("<L", value & 0xffffffff))
32
33
def read32(input):
    """Read four bytes from *input* and return them as an unsigned integer.

    The bytes are interpreted least significant byte first, mirroring
    write32().

    Raises IndexError if fewer than four bytes could be read.
    """
    buf = input.read(4)
    if len(buf) != 4:
        # IndexError is what indexing the short buffer used to raise.  It is
        # deliberately not EOFError, which GzipFile.read() treats as a
        # normal end of data and would silently swallow.
        raise IndexError("read32: expected 4 bytes, got %d" % len(buf))
    return struct.unpack("<L", buf)[0]
41
def open(filename, mode="r", compresslevel=9):
    """Open the gzip file *filename* and return a GzipFile object.

    This is a shorthand for GzipFile(filename, mode, compresslevel).
    Inside this module the name shadows the built-in open(); the built-in
    is reached through __builtin__.open.
    """
    return GzipFile(filename, mode, compresslevel)
44
class GzipFile:
    """File-like object that reads or writes a single gzip stream.

    Data passed to write() is compressed and written to the underlying
    file object; read(), readline() and readlines() return decompressed
    data.  Random access is not supported: seek() and tell() raise IOError.

    When reading, the CRC and length stored at the end of the file are
    checked once the end of the data is reached.  A mismatch is not raised;
    it is recorded as a message in the ``error`` attribute, which is only
    set when such a mismatch was found.
    """

    # File object opened by this instance itself (and therefore closed by
    # close()).  Stays None when the caller supplied ``fileobj``.
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Set up the stream and immediately read or write the gzip header.

        filename      -- path to open when ``fileobj`` is None; otherwise only
                         used for the name stored in a written header
        mode          -- a mode starting with 'r' (read) or 'w' (write); when
                         None it is taken from ``fileobj.mode``, else 'r'
        compresslevel -- level passed to zlib.compressobj when writing
        fileobj       -- already-open file object to use instead of opening
                         ``filename``

        Raises ValueError if the mode starts with neither 'r' nor 'w'.
        """
        if fileobj is None:
            # gzip data is binary, so the file is always opened in binary
            # mode regardless of whether the caller wrote 'r' or 'rb'.
            filemode = mode or 'rb'
            if 'b' not in filemode:
                filemode = filemode + 'b'
            fileobj = self.myfileobj = __builtin__.open(filename, filemode)
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'r'

        if mode[0:1] == 'r':
            self.mode = READ
            self._init_read()
            self.filename = filename
            # Negative window bits: raw deflate data without a zlib header.
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)

        elif mode[0:1] == 'w':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Mode " + mode + " not supported")

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()
        elif self.mode == READ:
            self._read_gzip_header()

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state; self.filename always ends in '.gz'."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # The stored name is self.filename without its '.gz' suffix.
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32(self.fileobj, int(time.time()))    # modification time
        self.fileobj.write('\002')                 # extra flags
        self.fileobj.write('\377')                 # OS
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the read-side state (running CRC, size, pending data)."""
        self.crc = zlib.crc32("")
        self.size = 0
        # Decompressed data not yet handed to the caller, and its length.
        self.extrabuf = ""
        self.extrasize = 0

    def _read_gzip_header(self):
        """Consume the gzip member header, leaving the file at the data.

        Raises RuntimeError if the magic number or compression method is
        not the expected one.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise RuntimeError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise RuntimeError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extra flags (1 byte) and OS (1 byte).
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while 1:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress *data* and write it to the underlying file object.

        Raises ValueError if the GzipFile has been closed.
        """
        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))

    def writelines(self, L):
        """Write every string in the sequence *L*; nothing is added between them."""
        for line in L:
            self.write(line)

    def read(self, size=None):
        """Return up to *size* bytes of decompressed data.

        With size None or negative, everything up to the end of the stream
        is returned.  read(0) returns ''.  '' is also returned once the
        stream is exhausted.
        """
        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size is None or size < 0:        # get the whole thing
            try:
                while 1:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                size = self.extrasize
        else:                               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                # Fewer than size bytes remain.  Clamp so that extrasize
                # cannot go negative, which would make data later pushed
                # back by _unread() unreachable.
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        return chunk

    def _unread(self, buf):
        """Push *buf* back in front of the pending decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize

    def _read(self, size=1024):
        """Read *size* compressed bytes and append their decompressed form
        to the pending data.

        Raises EOFError once the compressed stream is exhausted.
        """
        if self.fileobj is None:
            # The end of the stream was already reached on an earlier call.
            raise EOFError("Reached EOF")
        buf = self.fileobj.read(size)
        if buf == "":
            uncompress = self.decompress.flush()
            if uncompress == "":
                self._read_eof()
                self.fileobj = None
                raise EOFError('Reached EOF')
        else:
            uncompress = self.decompress.decompress(buf)
        self.crc = zlib.crc32(uncompress, self.crc)
        self.extrabuf = self.extrabuf + uncompress
        self.extrasize = self.extrasize + len(uncompress)
        self.size = self.size + len(uncompress)

    def _read_eof(self):
        """Compare the stored CRC and length against the data actually read.

        A mismatch is recorded in self.error rather than raised.
        """
        # Andrew writes:
        ## We've read to the end of the file, so we have to rewind in order
        ## to reread the 8 bytes containing the CRC and the file size.  The
        ## decompressor is smart and knows when to stop, so feeding it
        ## extra data is harmless.
        self.fileobj.seek(-8, 2)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
        # Compare modulo 2**32: read32() returns unsigned values, whereas
        # zlib.crc32() may return a signed one and self.size is unbounded.
        if (crc32 & 0xffffffff) != (self.crc & 0xffffffff):
            self.error = "CRC check failed"
        elif (isize & 0xffffffff) != (self.size & 0xffffffff):
            self.error = "Incorrect length of data produced"

    def close(self):
        """Finish the stream; when writing, this emits the CRC/size trailer.

        A caller-supplied file object is left open; a file opened by this
        instance is closed.  Calling close() more than once is harmless.
        """
        if self.mode == WRITE and self.fileobj is not None:
            self.fileobj.write(self.compress.flush())
            write32(self.fileobj, self.crc)
            write32(self.fileobj, self.size)
        self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def seek(self, offset=0, whence=0):
        """Always raises IOError; the arguments are accepted only so that a
        normal seek(offset, whence) call reports the intended error."""
        raise IOError('Random access not allowed in gzip files')

    def tell(self):
        """Always raises IOError."""
        raise IOError('I won\'t tell() you for gzip files')

    def isatty(self):
        return 0

    def readline(self):
        """Return the next line including its trailing newline.

        The last line is returned without a newline if the data does not
        end with one.  '' is returned only at the end of the stream.
        """
        bufs = []
        readsize = 100
        while 1:
            c = self.read(readsize)
            i = c.find('\n')
            if i >= 0 or c == '':
                # With no newline found (i == -1, only possible here when
                # c is ''), both slices below are empty.
                bufs.append(c[:i + 1])
                self._unread(c[i + 1:])
                return ''.join(bufs)
            bufs.append(c)
            readsize = readsize * 2

    def readlines(self):
        """Return all remaining lines as a list, each keeping its newline."""
        buf = self.read()
        lines = buf.split('\n')
        # split() removed the separators; put them back on every line that
        # was actually terminated by one.
        for i in range(len(lines) - 1):
            lines[i] = lines[i] + '\n'
        # A trailing empty piece means the data ended with a newline (or was
        # empty); it is not a line of its own.
        if lines and not lines[-1]:
            del lines[-1]
        return lines
Guido van Rossum51ca6e31997-12-30 20:09:08 +0000266
267
def _test():
    """Command-line driver: compress the named files, or decompress with -d.

    Act like gzip; with -d, act like gunzip.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.  With no file arguments, or for the
    argument "-", data is copied from stdin to stdout.
    """
    import sys
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write("filename doesn't end in .gz: "
                                     + repr(arg) + "\n")
                    continue
                # open here is this module's open(), i.e. a GzipFile.
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while 1:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # The standard streams are left open; closing a GzipFile wrapped
        # around one of them only finishes the gzip stream.
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()
306
# Run the gzip/gunzip-like driver when executed as a script.
if __name__ == '__main__':
    _test()