blob: b9f80bbd9ef522cc96c8b36e9f6461143b383912 [file] [log] [blame]
Guido van Rossum15262191997-04-30 16:04:57 +00001import time
2import string
3import zlib
4import StringIO
5
# Implements a file-like class that reads and writes a gzipped file.
# The user of the file doesn't have to worry about the compression,
# but random access is not allowed.

# Based on Andrew Kuchling's minigzip.py distributed with the zlib module.
11
# Bit masks tested against the flag byte of a gzip header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the file was opened.
READ = 1
WRITE = 2
15
def write32(output, value):
    """Write the low 32 bits of value to output, least significant byte first.

    output only needs a write() method; the four bytes are written with a
    single call.
    """
    buf = ""
    for _ in range(4):
        # Peel off the lowest byte and keep the rest for the next round.
        value, low_byte = divmod(value, 256)
        buf = buf + chr(low_byte)
    output.write(buf)
31
32
def read32(input):
    """Read four bytes from input and return them as a little-endian integer.

    Raises IndexError if fewer than four bytes are available.
    """
    raw = input.read(4)
    total = 0
    for position in range(4):
        # Byte N contributes bits 8*N .. 8*N+7 of the result.
        total = total + (ord(raw[position]) << (8 * position))
    return total
40
# Module-level list; nothing in the visible code reads it or appends to it.
written = []

# Keep a reference to the builtin open() before this module defines its own
# open(); GzipFile uses it to open the underlying file.
_py_open = open
44
def open(filename, mode='r', compresslevel=9):
    """Open a gzip file and return a GzipFile object.

    Shorthand for GzipFile(filename, mode, compresslevel).  mode defaults
    to 'r', the same default GzipFile itself uses, so open(filename) works
    for reading.
    """
    return GzipFile(filename, mode, compresslevel)
47
class GzipFile:
    """File-like object that reads or writes a gzip-compressed file.

    Data passed to write() is compressed before it reaches the underlying
    file; read() returns decompressed data.  Only sequential access is
    supported: seek() and tell() raise IOError.
    """

    def __init__(self, filename, mode='r', compresslevel=9):
        """Open filename for reading ('r'/'rb') or writing ('w'/'wb').

        In write mode '.gz' is appended to filename unless it already ends
        with it.  compresslevel is handed to zlib.compressobj and is only
        used when writing.  Raises ValueError for any other mode, and
        RuntimeError if a file opened for reading has a bad gzip header.
        """
        if mode == 'r' or mode == 'rb':
            self.mode = READ
            self._init_read()
            self.filename = filename
            # Negative wbits: raw deflate stream, the gzip framing is
            # handled by this class.
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            binary_mode = 'rb'

        elif mode == 'w' or mode == 'wb':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            binary_mode = 'wb'
        else:
            raise ValueError("Mode " + mode + " not supported")

        # Always open in binary mode: gzip data is binary, and text mode
        # would corrupt it on platforms that translate line endings.
        self.fileobj = _py_open(self.filename, binary_mode)

        try:
            if self.mode == WRITE:
                self._write_gzip_header()
            elif self.mode == READ:
                self._read_gzip_header()
        except Exception:
            # Do not leak the file handle when the header is unusable.
            self.fileobj.close()
            raise

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Set up the state used in write mode and fix self.filename."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip header, storing the file name without '.gz'."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        self.fileobj.write(chr(FNAME))             # flags: name follows
        write32(self.fileobj, int(time.time()))    # modification time
        self.fileobj.write('\002')                 # extra flags
        self.fileobj.write('\377')                 # OS byte
        self.fileobj.write(self.filename[:-3] + '\000')

    def _init_read(self):
        """Set up the state used in read mode."""
        self.crc = zlib.crc32("")
        self.size = 0
        self.extrabuf = ""      # decompressed data not yet handed out
        self.extrasize = 0      # len(self.extrabuf)
        self.error = None       # set by _read_eof when the trailer is wrong

    def _read_exact(self, count):
        """Read exactly count header bytes or raise RuntimeError."""
        data = self.fileobj.read(count)
        if len(data) != count:
            raise RuntimeError('Truncated gzip header')
        return data

    def _skip_cstring(self):
        """Read and discard bytes up to and including a NUL terminator."""
        while 1:
            # _read_exact raises at end of file, so a header that lacks
            # the terminator cannot make this loop spin forever.
            if self._read_exact(1) == '\000':
                break

    def _read_gzip_header(self):
        """Check the gzip header and skip over its optional fields."""
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise RuntimeError('Not a gzipped file')
        method = ord(self._read_exact(1))
        if method != 8:
            raise RuntimeError('Unknown compression method')
        flag = ord(self._read_exact(1))
        # Skip modtime (4 bytes), extra flags (1 byte) and OS (1 byte).
        self._read_exact(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self._read_exact(1))
            xlen = xlen + 256 * ord(self._read_exact(1))
            self._read_exact(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            self._skip_cstring()
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            self._skip_cstring()
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file."""
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))

    def writelines(self, lines):
        """Write each string in lines, unchanged and with no separator."""
        for line in lines:
            self.write(line)

    def read(self, size=None):
        """Return up to size bytes of decompressed data.

        With size omitted, None or negative, everything up to end of file
        is returned.  An empty string means end of file (or size == 0).
        Raises IOError if the CRC or length stored at the end of the file
        does not match the data that was decompressed.
        """
        if self.extrasize <= 0 and self.fileobj.closed:
            return ''

        read_all = size is None or size < 0
        try:
            # _read() closes the file and raises EOFError at end of data.
            while not self.fileobj.closed and (read_all or
                                               size > self.extrasize):
                self._read()
        except EOFError:
            pass

        # Never hand out (or account for) more than is actually buffered.
        if read_all or size > self.extrasize:
            size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        return chunk

    def _read(self):
        """Decompress one more chunk of the file into self.extrabuf.

        Raises EOFError, after checking the trailer and closing the file,
        once no more data can be produced.
        """
        buf = self.fileobj.read(1024)
        if not buf:
            uncompress = self.decompress.flush()
            if not uncompress:
                try:
                    self._read_eof()
                finally:
                    self.fileobj.close()
                raise EOFError('Reached EOF')
        else:
            uncompress = self.decompress.decompress(buf)
        self.crc = zlib.crc32(uncompress, self.crc)
        self.extrabuf = self.extrabuf + uncompress
        self.extrasize = self.extrasize + len(uncompress)
        self.size = self.size + len(uncompress)

    def _read_eof(self):
        """Check the CRC and size stored in the last 8 bytes of the file.

        Raises IOError on a mismatch; the message is also kept in
        self.error.
        """
        # Andrew writes:
        ## We've read to the end of the file, so we have to rewind in order
        ## to reread the 8 bytes containing the CRC and the file size.  The
        ## decompressor is smart and knows when to stop, so feeding it
        ## extra data is harmless.
        self.fileobj.seek(-8, 2)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
        # Compare only the low 32 bits: zlib.crc32 may return a negative
        # (signed) value while read32 always returns a non-negative one,
        # and the stored size is 32 bits wide.
        mask = 0xFFFFFFFF
        if (crc32 & mask) != (self.crc & mask):
            self.error = "CRC check failed"
            raise IOError(self.error)
        elif (isize & mask) != (self.size & mask):
            self.error = "Incorrect length of data produced"
            raise IOError(self.error)

    def close(self):
        """Finish the gzip stream (write mode) and close the file.

        Calling close() again after the file is closed does nothing.
        """
        if self.fileobj.closed:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32(self.fileobj, self.crc)
            write32(self.fileobj, self.size)
            self.fileobj.close()
        elif self.mode == READ:
            self.fileobj.close()

    def flush(self):
        self.fileobj.flush()

    def seek(self, *args):
        # Accept the usual seek() arguments so callers get the intended
        # IOError rather than a TypeError about the argument count.
        raise IOError('Random access not allowed in gzip files')

    def tell(self):
        raise IOError('I won\'t tell() you for gzip files')

    def isatty(self):
        return 0

    def readline(self):
        # should I bother with this
        raise RuntimeError("not implemented")

    def readlines(self):
        # should I bother with this
        raise RuntimeError("not implemented")
227
228
class StringIOgz(GzipFile):

    """A StringIO substitute that reads/writes gzipped buffers."""

    def __init__(self, buf=None, filename="StringIOgz", compresslevel=9):
        """Read/write mode depends on first argument.

        If __init__ is passed a (non-empty) buffer, it will treat that as
        the gzipped data and set up the StringIO for reading.  Without the
        initial argument, it will assume a new file for writing.

        The filename argument is written in the header of buffers
        opened for writing.  Not sure that this is useful, but the
        GzipFile code expects *some* filename.

        compresslevel is passed to zlib.compressobj and is only used
        when writing.
        """

        if buf:
            self.mode = READ
            self._init_read()
            self.filename = filename
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self.fileobj = StringIO.StringIO(buf)
        else:
            self.mode = WRITE
            self._init_write(filename)
            # compresslevel used to be an undefined name here, so write
            # mode always failed with NameError; it is now a parameter.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self.fileobj = StringIO.StringIO()

        if self.mode == WRITE:
            self._write_gzip_header()
        elif self.mode == READ:
            self._read_gzip_header()
264