Xavier Ducrohet | b958224 | 2009-12-01 13:03:49 -0800 | [diff] [blame] | 1 | #!/usr/bin/python2.4 |
| 2 | # |
| 3 | # Copyright (C) 2008 Google Inc. |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | |
"""Module to compress directories into a series of zip files.
| 19 | |
| 20 | This module will take a directory and compress all its contents, including |
| 21 | child directories into a series of zip files named N.zip where 'N' ranges from |
| 22 | 0 to infinity. The zip files will all be below a certain specified maximum |
| 23 | threshold. |
| 24 | |
| 25 | The directory is compressed with a depth first traversal, each directory's |
file contents being compressed as it is visited, before the compression of any
| 27 | child directory's contents. In this way the files within an archive are ordered |
| 28 | and the archives themselves are ordered. |
| 29 | |
| 30 | The class also constructs a 'main.py' file intended for use with Google App |
| 31 | Engine with a custom App Engine program not currently distributed with this |
| 32 | code base. The custom App Engine runtime can leverage the index files written |
| 33 | out by this class to more quickly locate which zip file to serve a given URL |
| 34 | from. |
| 35 | """ |
| 36 | |
| 37 | __author__ = 'jmatt@google.com (Justin Mattson)' |
| 38 | |
| 39 | import optparse |
| 40 | import os |
| 41 | import stat |
| 42 | import sys |
| 43 | import zipfile |
| 44 | import divide_and_compress_constants |
| 45 | |
| 46 | |
def CreateOptionsParser():
  """Creates the parser for command line arguments.

  Returns:
    A configured optparse.OptionParser object.
  """
  # Each entry is (flag strings, add_option keyword arguments).
  option_specs = [
      (('-s', '--sourcefiles'),
       dict(dest='sourcefiles', default=None,
            help='The directory containing the files to compress')),
      (('-d', '--destination'),
       dict(dest='destination', default=None,
            help=('Where to put the archive files, this should not be'
                  ' a child of where the source files exist.'))),
      (('-f', '--filesize'),
       dict(dest='filesize', default='1M',
            help=('Maximum size of archive files. A number followed by '
                  'a magnitude indicator either "B", "K", "M", or "G". '
                  'Examples:\n 1000000B == one million BYTES\n'
                  ' 1.2M == one point two MEGABYTES\n'
                  ' 1M == 1048576 BYTES'))),
      (('-n', '--nocompress'),
       dict(action='store_false', dest='compress', default=True,
            help=('Whether the archive files should be compressed, or '
                  'just a concatenation of the source files'))),
  ]
  parser = optparse.OptionParser()
  for flags, kwargs in option_specs:
    parser.add_option(*flags, **kwargs)
  return parser
| 70 | |
| 71 | |
def VerifyArguments(options, parser):
  """Runs simple checks on correctness of commandline arguments.

  Prints the usage text and exits the process with status -1 when either
  required path option is missing (or absent from the options object).

  Args:
    options: The command line options passed.
    parser: The parser object used to parse the command string.
  """
  try:
    arguments_present = (options.sourcefiles is not None
                         and options.destination is not None)
  except AttributeError:
    # Treat a malformed options object the same as missing arguments.
    arguments_present = False
  if not arguments_present:
    parser.print_help()
    sys.exit(-1)
| 86 | |
| 87 | |
def ParseSize(size_str):
  """Parse the file size argument from a string to a number of bytes.

  The string is a numeric value followed by a one-character magnitude
  indicator, e.g. '1000000B', '1.2M', '1G'.

  Args:
    size_str: The string representation of the file size.

  Returns:
    The file size in bytes.

  Raises:
    ValueError: Raises an error if the numeric or qualifier portions of the
      file size argument is invalid.
  """
  if len(size_str) < 2:
    raise ValueError(('filesize argument not understood, please include'
                      ' a numeric value and magnitude indicator'))
  # Bytes-per-unit for each supported magnitude character.
  multipliers = {'B': 1, 'K': 1024, 'M': 1048576, 'G': 1073741824}
  magnitude = size_str[-1]
  if magnitude not in multipliers:
    raise ValueError(('filesize magnitude indicator not valid, must be "B",'
                      '"K","M", or "G"'))
  return int(float(size_str[:-1]) * multipliers[magnitude])
| 116 | |
| 117 | |
class DirectoryZipper(object):
  """Class to compress a directory and all its sub-directories."""

  def __init__(self, output_path, base_dir, archive_size, enable_compression):
    """DirectoryZipper constructor.

    Args:
      output_path: A string, the path to write the archives and index file to.
      base_dir: A string, the directory to compress.
      archive_size: A number, the maximum size, in bytes, of a single
        archive file.
      enable_compression: A boolean, whether or not compression should be
        enabled, if disabled, the files will be written into an uncompressed
        zip.
    """
    self.output_dir = output_path
    self.current_archive = '0.zip'
    self.base_path = base_dir
    self.max_size = archive_size
    self.compress = enable_compression

    # Set index_fp to None, because we don't know what it will be yet.
    self.index_fp = None

  def StartCompress(self):
    """Start compress of the directory.

    This will start the compression process and write the archives to the
    specified output directory. It will also produce a 'main.py' index file
    in the output directory that maps from file to archive.
    """
    self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
    self.index_fp.write(divide_and_compress_constants.file_preamble)
    # os.walk (available since Python 2.3) replaces the deprecated
    # os.path.walk, which was removed in Python 3.  The traversal is still
    # top-down, and CompressDirectory keeps its walker-callback signature.
    for dir_path, dir_names, file_names in os.walk(self.base_path):
      # Sort subdirectories in place so children are visited alphabetically,
      # matching the in-place sort the old os.path.walk callback relied on.
      dir_names.sort()
      self.CompressDirectory(1, dir_path, dir_names + file_names)
    self.index_fp.write(divide_and_compress_constants.file_endpiece)
    self.index_fp.close()

  def RemoveLastFile(self, archive_path=None):
    """Removes the last item in the archive.

    This removes the last item in the archive by reading the items out of the
    archive, adding them to a new archive, deleting the old archive, and
    moving the new archive to the location of the old archive.

    Args:
      archive_path: Path to the archive to modify. This archive should not be
        open elsewhere, since it will need to be deleted. Defaults to the
        current archive in the output directory.
    """
    if archive_path is None:
      archive_path = os.path.join(self.output_dir, self.current_archive)

    # Move the old file and create a new one at its old location.
    root, ext = os.path.splitext(archive_path)
    old_archive = ''.join([root, '-old', ext])
    os.rename(archive_path, old_archive)
    old_fp = self.OpenZipFileAtPath(old_archive, mode='r')

    # By default, store uncompressed.
    compress_bit = zipfile.ZIP_STORED
    if self.compress:
      compress_bit = zipfile.ZIP_DEFLATED
    new_fp = self.OpenZipFileAtPath(archive_path,
                                    mode='w',
                                    compress=compress_bit)

    # Copy every member of the old archive into the new one, except the last.
    for zip_member in old_fp.infolist()[:-1]:
      new_fp.writestr(zip_member, old_fp.read(zip_member.filename))

    # Close files and delete the old one.
    old_fp.close()
    new_fp.close()
    os.unlink(old_archive)

  def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
    """This method is mainly for testing purposes, eg dependency injection.

    When no mode is given, appends to an existing archive or creates a new
    one as appropriate.
    """
    if mode is None:
      if os.path.exists(path):
        mode = 'a'
      else:
        mode = 'w'

    if mode == 'r':
      return zipfile.ZipFile(path, mode)
    else:
      return zipfile.ZipFile(path, mode, compress)

  def CompressDirectory(self, unused_id, dir_path, dir_contents):
    """Method to compress the given directory.

    This method compresses the directory 'dir_path'. It will add to an
    existing zip file that still has space and create new ones as necessary
    to keep zip file sizes under the maximum specified size. This also writes
    out the mapping of files to archives to the self.index_fp file descriptor.

    Args:
      unused_id: A numeric identifier passed by the directory walker, this
        is not used by this method.
      dir_path: A string, the path to the directory to compress.
      dir_contents: A list of directory contents to be compressed.
    """
    # Process contents in alphabetical order so archive ordering is
    # deterministic.
    dir_contents.sort()
    zip_queue = [os.path.join(dir_path, name) for name in dir_contents]
    compress_bit = zipfile.ZIP_DEFLATED
    if not self.compress:
      compress_bit = zipfile.ZIP_STORED

    # Zip all files in this directory, adding to existing archives and
    # creating new ones as necessary.
    while zip_queue:
      target_file = zip_queue[0]
      if os.path.isfile(target_file):
        self.AddFileToArchive(target_file, compress_bit)

        # See if adding the new file made our archive too large.
        if not self.ArchiveIsValid():

          # IF fixing fails, the last added file was too large by itself,
          # skip it.
          # ELSE the current archive filled normally, make a new one and try
          # adding the file again.
          if not self.FixArchive('SIZE'):
            zip_queue.pop(0)
          else:
            # Derive the next archive name, e.g. '3.zip' -> '4.zip'.
            self.current_archive = '%i.zip' % (
                int(self.current_archive[
                    0:self.current_archive.rfind('.zip')]) + 1)
        else:

          # Write an index record if necessary.
          self.WriteIndexRecord()
          zip_queue.pop(0)
      else:
        # Not a regular file (e.g. a subdirectory); the walker visits
        # subdirectories separately, so just drop it from the queue.
        zip_queue.pop(0)

  def WriteIndexRecord(self):
    """Write an index record to the index file.

    Only write an index record if this is the first file to go into archive

    Returns:
      True if an archive record is written, False if it isn't.
    """
    archive = self.OpenZipFileAtPath(
        os.path.join(self.output_dir, self.current_archive), 'r')
    archive_index = archive.infolist()
    if len(archive_index) == 1:
      self.index_fp.write(
          '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                   archive_index[0].filename))
      archive.close()
      return True
    else:
      archive.close()
      return False

  def FixArchive(self, problem):
    """Make the archive compliant.

    Args:
      problem: An enum, the reason the archive is invalid.

    Returns:
      Whether the file(s) removed to fix the archive could conceivably be
      in an archive, but for some reason can't be added to this one.
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return_value = None

    if problem == 'SIZE':
      archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
      num_archive_files = len(archive_obj.infolist())

      # IF there is a single file, that means it alone exceeds the maximum
      # archive size and can never be stored; delete the created archive.
      # ELSE do normal finalization: drop the last file (it will be retried
      # in a fresh archive) and report this archive's final size.
      if num_archive_files == 1:
        # os.path.join keeps the separator between the base path and the
        # member name (the old '%s%s' concatenation ran them together).
        print('WARNING: %s is too large to store.' % os.path.join(
            self.base_path, archive_obj.infolist()[0].filename))
        archive_obj.close()
        os.unlink(archive_path)
        return_value = False
      else:
        archive_obj.close()
        self.RemoveLastFile(
            os.path.join(self.output_dir, self.current_archive))
        print('Final archive size for %s is %i' % (
            self.current_archive, os.path.getsize(archive_path)))
        return_value = True
    return return_value

  def AddFileToArchive(self, filepath, compress_bit):
    """Add the file at filepath to the current archive.

    Args:
      filepath: A string, the path of the file to add.
      compress_bit: A zipfile compression constant (zipfile.ZIP_DEFLATED or
        zipfile.ZIP_STORED), whether or not this file should be compressed
        when added.

    Returns:
      True if the file could be added (typically because this is a file) or
      False if it couldn't be added (typically because its a directory or a
      symbolic link, which are skipped).
    """
    curr_archive_path = os.path.join(self.output_dir, self.current_archive)
    if os.path.isfile(filepath) and not os.path.islink(filepath):
      if os.path.getsize(filepath) > 1048576:
        print('Warning: %s is potentially too large to serve on GAE'
              % filepath)
      archive = self.OpenZipFileAtPath(curr_archive_path,
                                       compress=compress_bit)
      # Store the file under its path relative to the base directory.
      archive.write(filepath, filepath[len(self.base_path):])
      archive.close()
      return True
    else:
      return False

  def ArchiveIsValid(self):
    """Check whether the archive is valid.

    Currently this only checks whether the archive is under the required size.
    The thought is that eventually this will do additional validation

    Returns:
      True if the archive is valid, False if its not.
    """
    archive_path = os.path.join(self.output_dir, self.current_archive)
    return os.path.getsize(archive_path) <= self.max_size
| 352 | |
| 353 | |
def main(argv):
  """Parse and validate the command line, then run the compression."""
  options_parser = CreateOptionsParser()
  options, _ = options_parser.parse_args(args=argv[1:])
  VerifyArguments(options, options_parser)
  DirectoryZipper(options.destination,
                  options.sourcefiles,
                  ParseSize(options.filesize),
                  options.compress).StartCompress()
| 363 | |
| 364 | |
# Script entry point: hand the full command line (argv[0] included) to main.
if __name__ == '__main__':
  main(sys.argv)