#!/usr/bin/python2.4
#
# Copyright (C) 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

18"""Module to compress directories in to series of zip files.
19
20This module will take a directory and compress all its contents, including
21child directories into a series of zip files named N.zip where 'N' ranges from
220 to infinity. The zip files will all be below a certain specified maximum
23threshold.
24
25The directory is compressed with a depth first traversal, each directory's
26file contents being compressed as it is visisted, before the compression of any
27child directory's contents. In this way the files within an archive are ordered
28and the archives themselves are ordered.
29
30The class also constructs a 'main.py' file intended for use with Google App
31Engine with a custom App Engine program not currently distributed with this
32code base. The custom App Engine runtime can leverage the index files written
33out by this class to more quickly locate which zip file to serve a given URL
34from.
35"""
36
37__author__ = 'jmatt@google.com (Justin Mattson)'
38
import optparse
import os
import stat
import sys
import zipfile
import divide_and_compress_constants
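
# Example invocation (the paths here are hypothetical; the flags are the ones
# defined in CreateOptionsParser below):
#
#   python divide_and_compress.py -s /path/to/site -d /path/to/archives -f 900K
#
# This writes 0.zip, 1.zip, ... plus a generated main.py index into the
# destination directory.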


def CreateOptionsParser():
  """Creates the parser for command line arguments.

  Returns:
    A configured optparse.OptionParser object.
  """
  rtn = optparse.OptionParser()
  rtn.add_option('-s', '--sourcefiles', dest='sourcefiles', default=None,
                 help='The directory containing the files to compress')
  rtn.add_option('-d', '--destination', dest='destination', default=None,
                 help=('Where to put the archive files; this should not be'
                       ' a child of where the source files exist.'))
  rtn.add_option('-f', '--filesize', dest='filesize', default='1M',
                 help=('Maximum size of archive files. A number followed by '
                       'a magnitude indicator, either "B", "K", "M", or "G". '
                       'Examples:\n 1000000B == one million BYTES\n'
                       ' 1.2M == one point two MEGABYTES\n'
                       ' 1M == 1048576 BYTES'))
  rtn.add_option('-n', '--nocompress', action='store_false', dest='compress',
                 default=True,
                 help=('Whether the archive files should be compressed, or '
                       'just a concatenation of the source files'))
  return rtn


def VerifyArguments(options, parser):
  """Runs simple checks on the correctness of the command line arguments.

  Args:
    options: The command line options passed.
    parser: The parser object used to parse the command string.
  """
  try:
    if options.sourcefiles is None or options.destination is None:
      parser.print_help()
      sys.exit(-1)
  except AttributeError:
    parser.print_help()
    sys.exit(-1)


def ParseSize(size_str):
  """Parses the file size argument from a string into a number of bytes.

  Args:
    size_str: The string representation of the file size.

  Returns:
    The file size in bytes.

  Raises:
    ValueError: If the numeric or magnitude portion of the file size
      argument is invalid.
  """
  if len(size_str) < 2:
    raise ValueError(('filesize argument not understood, please include'
                      ' a numeric value and magnitude indicator'))
  magnitude = size_str[-1]
  if magnitude not in ('B', 'K', 'M', 'G'):
    raise ValueError(('filesize magnitude indicator not valid, must be "B", '
                      '"K", "M", or "G"'))
  numeral = float(size_str[:-1])
  if magnitude == 'K':
    numeral *= 1024
  elif magnitude == 'M':
    numeral *= 1048576
  elif magnitude == 'G':
    numeral *= 1073741824
  return int(numeral)
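
# A few illustrative values for ParseSize (not exercised anywhere in this
# script; shown only to document the multipliers above):
#   ParseSize('1000000B') -> 1000000
#   ParseSize('2K')       -> 2048
#   ParseSize('1.2M')     -> 1258291  (1.2 * 1048576, truncated by int())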


class DirectoryZipper(object):
  """Class to compress a directory and all its sub-directories."""

  def __init__(self, output_path, base_dir, archive_size, enable_compression):
    """DirectoryZipper constructor.

    Args:
      output_path: A string, the path to write the archives and index file to.
      base_dir: A string, the directory to compress.
      archive_size: A number, the maximum size, in bytes, of a single
        archive file.
      enable_compression: A boolean, whether or not compression should be
        enabled; if disabled, the files will be written into an uncompressed
        zip.
    """
    self.output_dir = output_path
    self.current_archive = '0.zip'
    self.base_path = base_dir
    self.max_size = archive_size
    self.compress = enable_compression

    # Set index_fp to None, because we don't know what it will be yet.
    self.index_fp = None

  def StartCompress(self):
    """Starts compression of the directory.

    This starts the compression process and writes the archives to the
    specified output directory. It also produces a 'main.py' index file in
    the output directory that maps each file to the archive containing it.
    """
    self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
    self.index_fp.write(divide_and_compress_constants.file_preamble)
    os.path.walk(self.base_path, self.CompressDirectory, 1)
    self.index_fp.write(divide_and_compress_constants.file_endpiece)
    self.index_fp.close()
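
  # A minimal sketch of driving this class directly, mirroring what main()
  # does from the command-line flags (the paths and the 1 MB limit below are
  # hypothetical):
  #   zipper = DirectoryZipper('/tmp/archives/', '/tmp/site/', 1048576, True)
  #   zipper.StartCompress()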

  def RemoveLastFile(self, archive_path=None):
    """Removes the last item in the archive.

    This removes the last item in the archive by reading the members out of
    the archive, adding all but the last one to a new archive at the original
    path, and deleting the old archive.

    Args:
      archive_path: Path to the archive to modify. This archive should not be
        open elsewhere, since it will need to be deleted.
    """
    if archive_path is None:
      archive_path = os.path.join(self.output_dir, self.current_archive)

    # Move the old file aside and create a new one at its old location.
    root, ext = os.path.splitext(archive_path)
    old_archive = ''.join([root, '-old', ext])
    os.rename(archive_path, old_archive)
    old_fp = self.OpenZipFileAtPath(old_archive, mode='r')

    # By default, store uncompressed.
    compress_bit = zipfile.ZIP_STORED
    if self.compress:
      compress_bit = zipfile.ZIP_DEFLATED
    new_fp = self.OpenZipFileAtPath(archive_path,
                                    mode='w',
                                    compress=compress_bit)

    # Copy the members of the old archive into the new one, except the last.
    for zip_member in old_fp.infolist()[:-1]:
      new_fp.writestr(zip_member, old_fp.read(zip_member.filename))

    # Close both archives and delete the old one.
    old_fp.close()
    new_fp.close()
    os.unlink(old_archive)

  def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
    """This method is mainly for testing, e.g. for dependency injection."""
    if mode is None:
      if os.path.exists(path):
        mode = 'a'
      else:
        mode = 'w'

    if mode == 'r':
      return zipfile.ZipFile(path, mode)
    else:
      return zipfile.ZipFile(path, mode, compress)

  def CompressDirectory(self, unused_id, dir_path, dir_contents):
    """Compresses the contents of the given directory.

    This method compresses the directory 'dir_path'. It will add to an
    existing zip file that still has space and create new ones as necessary
    to keep zip file sizes under the maximum specified size. It also writes
    the mapping of files to archives to the self.index_fp file descriptor.

    Args:
      unused_id: A numeric identifier passed by the os.path.walk method; it
        is not used by this method.
      dir_path: A string, the path to the directory to compress.
      dir_contents: A list of directory contents to be compressed.
    """
    # Construct the queue of files to be added. dir_contents seems to be
    # given in reverse alphabetical order, so sort it so that files are
    # processed in alphabetical order.
    dir_contents.sort()
    zip_queue = []
    for filename in dir_contents:
      zip_queue.append(os.path.join(dir_path, filename))
    compress_bit = zipfile.ZIP_DEFLATED
    if not self.compress:
      compress_bit = zipfile.ZIP_STORED

    # Zip all files in this directory, adding to existing archives and
    # creating new ones as necessary.
    while zip_queue:
      target_file = zip_queue[0]
      if os.path.isfile(target_file):
        self.AddFileToArchive(target_file, compress_bit)

        # See if adding the new file made our archive too large.
        if not self.ArchiveIsValid():

          # IF fixing fails, the last added file was too large; skip it.
          # ELSE the current archive filled normally; make a new one and try
          # adding the file again.
          if not self.FixArchive('SIZE'):
            zip_queue.pop(0)
          else:
            self.current_archive = '%i.zip' % (
                int(self.current_archive[
                    0:self.current_archive.rfind('.zip')]) + 1)
        else:

          # Write an index record if necessary.
          self.WriteIndexRecord()
          zip_queue.pop(0)
      else:
        zip_queue.pop(0)

  def WriteIndexRecord(self):
    """Writes an index record to the index file.

    An index record is only written if this is the first file to go into
    the current archive.

    Returns:
      True if an archive record is written, False if it isn't.
    """
    archive = self.OpenZipFileAtPath(
        os.path.join(self.output_dir, self.current_archive), 'r')
    archive_index = archive.infolist()
    if len(archive_index) == 1:
      self.index_fp.write(
          '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                   archive_index[0].filename))
      archive.close()
      return True
    else:
      archive.close()
      return False
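
  # For reference, each record written above looks like the following (the
  # archive name and member path are illustrative):
  #   ['0.zip', 'css/styles.css'],
  # StartCompress brackets these records with the preamble and endpiece from
  # divide_and_compress_constants to form the generated main.py.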

  def FixArchive(self, problem):
    """Makes the current archive compliant.

    Args:
      problem: An enum, the reason the archive is invalid.

    Returns:
      True if the file removed to fix the archive could still be stored in
      another archive (it just didn't fit in this one); False if it can't be
      archived at all.
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return_value = None

    if problem == 'SIZE':
      archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
      num_archive_files = len(archive_obj.infolist())

      # IF the archive holds a single file, that file is too large to store
      # at all, so delete the created archive.
      # ELSE do normal finalization.
      if num_archive_files == 1:
        print ('WARNING: %s%s is too large to store.' % (
            self.base_path, archive_obj.infolist()[0].filename))
        archive_obj.close()
        os.unlink(archive_path)
        return_value = False
      else:
        archive_obj.close()
        self.RemoveLastFile(
            os.path.join(self.output_dir, self.current_archive))
        print 'Final archive size for %s is %i' % (
            self.current_archive, os.path.getsize(archive_path))
        return_value = True
    return return_value

  def AddFileToArchive(self, filepath, compress_bit):
    """Adds the file at filepath to the current archive.

    Args:
      filepath: A string, the path of the file to add.
      compress_bit: The zipfile compression constant (zipfile.ZIP_DEFLATED or
        zipfile.ZIP_STORED) to use when adding the file.

    Returns:
      True if the file could be added (typically because it is a regular
      file), or False if it couldn't be added (typically because it is a
      directory or a symlink).
    """
    curr_archive_path = os.path.join(self.output_dir, self.current_archive)
    if os.path.isfile(filepath) and not os.path.islink(filepath):
      if os.path.getsize(filepath) > 1048576:
        print 'Warning: %s is potentially too large to serve on GAE' % filepath
      archive = self.OpenZipFileAtPath(curr_archive_path,
                                       compress=compress_bit)
      # Add the file to the archive.
      archive.write(filepath, filepath[len(self.base_path):])
      archive.close()
      return True
    else:
      return False
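
  # Note that the archive member name is the file's path relative to
  # self.base_path; for example (hypothetical paths), with base_path
  # '/tmp/site/' the file '/tmp/site/css/styles.css' is stored as
  # 'css/styles.css'.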

  def ArchiveIsValid(self):
    """Checks whether the archive is valid.

    Currently this only checks whether the archive is under the required
    size. The thought is that eventually this will do additional validation.

    Returns:
      True if the archive is valid, False if it's not.
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return os.path.getsize(archive_path) <= self.max_size


def main(argv):
  parser = CreateOptionsParser()
  (options, unused_args) = parser.parse_args(args=argv[1:])
  VerifyArguments(options, parser)
  zipper = DirectoryZipper(options.destination,
                           options.sourcefiles,
                           ParseSize(options.filesize),
                           options.compress)
  zipper.StartCompress()


if __name__ == '__main__':
  main(sys.argv)