blob: d369be4a4e07a9d3e004374e7536901e8d0820c1 [file] [log] [blame]
#!/usr/bin/python2.4
#
# Copyright (C) 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module to compress directories in to series of zip files.
This module will take a directory and compress all its contents, including
child directories into a series of zip files named N.zip where 'N' ranges from
0 to infinity. The zip files will all be below a certain specified maximum
threshold.
The directory is compressed with a depth first traversal, each directory's
file contents being compressed as it is visited, before the compression of any
child directory's contents. In this way the files within an archive are ordered
and the archives themselves are ordered.
The class also constructs a 'main.py' file intended for use with Google App
Engine with a custom App Engine program not currently distributed with this
code base. The custom App Engine runtime can leverage the index files written
out by this class to more quickly locate which zip file to serve a given URL
from.
"""
__author__ = 'jmatt@google.com (Justin Mattson)'
from optparse import OptionParser
import os
import stat
import sys
import zipfile
from zipfile import ZipFile
import divide_and_compress_constants
def Main(argv):
    """Parse the command line and compress the requested directory.

    Args:
        argv: the full argument vector, program name first (as in sys.argv).
            When it is empty or None the parser falls back to sys.argv.
    """
    # Honor the caller-supplied argument vector instead of silently reading
    # sys.argv; optparse treats None as "use sys.argv[1:]".
    cli_args = None
    if argv:
        cli_args = argv[1:]
    parser = CreateOptionsParser()
    (options, unused_args) = parser.parse_args(cli_args)
    VerifyArguments(options, parser)
    zipper = DirectoryZipper(options.destination,
                             options.sourcefiles,
                             ParseSize(options.filesize),
                             options.compress)
    zipper.StartCompress()
def CreateOptionsParser():
    """Build the command line parser for this tool.

    Returns:
        An OptionParser that understands -s/--sourcefiles, -d/--destination,
        -f/--filesize (default '1M') and -n/--nocompress (which stores False
        in 'compress'; the default is True).
    """
    rtn = OptionParser()
    rtn.add_option('-s', '--sourcefiles', dest='sourcefiles', default=None,
                   help='The directory containing the files to compress')
    rtn.add_option('-d', '--destination', dest='destination', default=None,
                   help=('Where to put the archive files, this should not be'
                         ' a child of where the source files exist.'))
    # Adjacent string literals are concatenated verbatim, so every fragment
    # must carry its own separating space.
    rtn.add_option('-f', '--filesize', dest='filesize', default='1M',
                   help=('Maximum size of archive files. A number followed by '
                         'a magnitude indicator, eg. 1000000B == one million '
                         'BYTES, 500K == five hundred KILOBYTES, 1.2M == one '
                         'point two MEGABYTES. 1M == 1048576 BYTES'))
    rtn.add_option('-n', '--nocompress', action='store_false', dest='compress',
                   default=True,
                   help=('Whether the archive files should be compressed, or '
                         'just a concatenation of the source files'))
    return rtn
def VerifyArguments(options, parser):
    """Exit with usage help unless both required options were supplied.

    Args:
        options: the parsed options object; its 'sourcefiles' and
            'destination' attributes are required.
        parser: the parser used to print the help text.

    Raises:
        SystemExit: with exit status -1 when either attribute is missing or
            None.
    """
    # A missing attribute is treated exactly like an unset (None) option.
    source = getattr(options, 'sourcefiles', None)
    destination = getattr(options, 'destination', None)
    if source is None or destination is None:
        parser.print_help()
        sys.exit(-1)
def ParseSize(size_str):
    """Convert a human readable size such as '500K' into a byte count.

    Args:
        size_str: a number immediately followed by a magnitude indicator:
            'B' (bytes), 'K' (multiples of 1024) or 'M' (multiples of
            1048576). The number may be fractional, eg. '1.2M'.

    Returns:
        The size in bytes as an int, truncated towards zero.

    Raises:
        ValueError: if the string is too short, the magnitude indicator is
            unknown, the numeric part is not a finite number, or the size is
            negative.
    """
    if len(size_str) < 2:
        raise ValueError(('filesize argument not understood, please include'
                          ' a numeric value and magnitude indicator'))
    multipliers = {'B': 1, 'K': 1024, 'M': 1048576}
    magnitude = size_str[-1]
    if magnitude not in multipliers:
        raise ValueError(('filesize magnitude indicator not valid, must be '
                          '\'K\', \'B\', or \'M\''))
    numeral_str = size_str[:-1]
    try:
        # int() rejects NaN (ValueError) and infinity (OverflowError); report
        # both, and unparsable text, as one consistent ValueError.
        size = int(float(numeral_str) * multipliers[magnitude])
    except (ValueError, OverflowError):
        raise ValueError('filesize numeric value not understood: %r'
                         % numeral_str)
    if size < 0:
        # A negative limit can never be satisfied by any archive.
        raise ValueError('filesize must not be negative: %r' % size_str)
    return size
class DirectoryZipper(object):
    """Class to compress a directory and all its sub-directories.

    Files are appended to numbered archives ('0.zip', '1.zip', ...) in the
    output directory. Whenever adding a file pushes the current archive over
    the size limit, that file is removed again and retried in the next
    archive. The first file of every archive is recorded in the index file.
    """

    # Name (not path) of the archive currently being filled, eg. '0.zip'.
    current_archive = None
    # Directory that receives the archives and the generated 'main.py'.
    output_dir = None
    # Directory being compressed; also the prefix stripped from member names.
    base_path = None
    # Maximum size, in bytes, of a single archive file.
    max_size = None
    # True to deflate members, False to store them uncompressed.
    compress = None
    # Open file object for the index being written (set by StartCompress).
    index_fp = None

    def __init__(self, output_path, base_dir, archive_size, enable_compression):
        """DirectoryZipper constructor.

        Args:
            output_path: the path to write the archives and index file to
            base_dir: the directory to compress
            archive_size: the maximum size, in bytes, of a single archive file
            enable_compression: whether or not compression should be enabled,
                if disabled, the files will be written into an uncompressed
                zip
        """
        self.output_dir = output_path
        self.current_archive = '0.zip'
        self.base_path = base_dir
        self.max_size = archive_size
        self.compress = enable_compression

    def _ArchivePath(self, archive_name=None):
        """Return the path of an archive (default: the current one)."""
        if archive_name is None:
            archive_name = self.current_archive
        # os.path.join copes with output_dir given with or without a trailing
        # separator; plain concatenation wrote next to the directory instead
        # of inside it when the separator was missing.
        return os.path.join(self.output_dir, archive_name)

    def _NextArchiveName(self):
        """Return the name of the archive that follows the current one."""
        number = int(os.path.splitext(self.current_archive)[0])
        return '%i.zip' % (number + 1)

    def StartCompress(self):
        """Start compress of the directory.

        This will start the compression process and write the archives to the
        specified output directory. It will also produce a 'main.py' file in
        the output directory that maps from archive to the first file stored
        in it.
        """
        self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
        try:
            self.index_fp.write(divide_and_compress_constants.file_preamble)
            # os.walk visits a directory before its children; sorting the
            # sub-directory list in place fixes the order they are visited in.
            for dir_path, dir_names, file_names in os.walk(self.base_path):
                dir_names.sort()
                self.CompressDirectory(1, dir_path, dir_names + file_names)
            self.index_fp.write(divide_and_compress_constants.file_endpiece)
        finally:
            # Never leak the index handle, even if compression fails.
            self.index_fp.close()

    def RemoveLastFile(self, archive_path=None):
        """Removes the last item in the archive.

        This removes the last item in the archive by reading the items out of
        the archive, adding them to a new archive, deleting the old archive,
        and moving the new archive to the location of the old archive.

        Args:
            archive_path: Path to the archive to modify. This archive should
                not be open elsewhere, since it will need to be deleted.
                Defaults to the current archive.
        """
        if archive_path is None:
            archive_path = self._ArchivePath()
        # Move the old file and create a new one at its old location
        root, extension = os.path.splitext(archive_path)
        old_archive = ''.join([root, '-old', extension])
        os.rename(archive_path, old_archive)
        compression = zipfile.ZIP_STORED
        if self.compress:
            compression = zipfile.ZIP_DEFLATED
        old_fp = self.OpenZipFileAtPath(old_archive, mode='r')
        try:
            new_fp = self.OpenZipFileAtPath(archive_path,
                                            mode='w',
                                            compress=compression)
            try:
                # Copy every member except the last one. Passing the ZipInfo
                # itself keeps the member's timestamp and attributes.
                for member in old_fp.infolist()[:-1]:
                    new_fp.writestr(member, old_fp.read(member.filename))
            finally:
                new_fp.close()
        finally:
            old_fp.close()
        os.unlink(old_archive)

    def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
        """This method is mainly for testing purposes, eg dependency injection.

        Args:
            path: the path of the zip file to open
            mode: 'r', 'w' or 'a'; when None, append to an existing file and
                otherwise create a new one
            compress: the zipfile compression constant used for writing

        Returns:
            The opened ZipFile object.
        """
        if mode is None:
            if os.path.exists(path):
                mode = 'a'
            else:
                mode = 'w'
        if mode == 'r':
            return ZipFile(path, mode)
        return ZipFile(path, mode, compress)

    def CompressDirectory(self, irrelevant, dir_path, dir_contents):
        """Method to compress the given directory.

        This method compresses the files directly inside 'dir_path'. It will
        add to an existing zip file that still has space and create new ones
        as necessary to keep zip file sizes under the maximum specified size.
        This also writes out the mapping of files to archives to the
        self.index_fp file descriptor.

        Args:
            irrelevant: an extra argument supplied by the directory walker,
                this is not used by this method
            dir_path: the path to the directory to compress
            dir_contents: a list of directory contents to be compressed; it is
                sorted in place and entries that are not files are skipped
        """
        # Archive in alphabetical order so the result is deterministic.
        dir_contents.sort()
        compress_bit = zipfile.ZIP_DEFLATED
        if not self.compress:
            compress_bit = zipfile.ZIP_STORED
        for filename in dir_contents:
            target_file = os.path.join(dir_path, filename)
            if not os.path.isfile(target_file):
                continue
            # Keep retrying this file until it fits into an archive or turns
            # out to be too large for any archive.
            while True:
                self.AddFileToArchive(target_file, compress_bit)
                if self.ArchiveIsValid():
                    # Only records something if this file opened the archive.
                    self.WriteIndexRecord()
                    break
                if not self.FixArchive('SIZE'):
                    # The file is too large even on its own, skip it.
                    break
                # The archive filled up normally: move on to the next archive
                # and try adding the same file again.
                self.current_archive = self._NextArchiveName()

    def WriteIndexRecord(self):
        """Write an index record to the index file.

        Only write an index record if this is the first file to go into
        archive.

        Returns:
            True if an archive record is written, False if it isn't
        """
        archive = self.OpenZipFileAtPath(self._ArchivePath(), 'r')
        try:
            archive_index = archive.infolist()
        finally:
            archive.close()
        if len(archive_index) != 1:
            return False
        self.index_fp.write(
            '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                     archive_index[0].filename))
        return True

    def FixArchive(self, problem):
        """Make the archive compliant.

        Args:
            problem: the reason the archive is invalid; only 'SIZE' is handled

        Returns:
            True if the last file was removed from a multi-file archive (so it
            can be retried in a new archive), False if the archive held a
            single file that is too large to store (the archive is deleted),
            or None when 'problem' is not recognised.
        """
        if problem != 'SIZE':
            return None
        archive_path = self._ArchivePath()
        archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
        try:
            members = archive_obj.infolist()
        finally:
            # Close before touching the file on disk: the archive is deleted
            # or renamed below and must not be open while that happens.
            archive_obj.close()
        if len(members) == 1:
            # A single file means it is too large to store on its own, so
            # delete the archive that was created for it.
            print('WARNING: %s%s is too large to store.' % (
                self.base_path, members[0].filename))
            os.unlink(archive_path)
            return False
        # Normal finalization: drop the file that overflowed the archive.
        self.RemoveLastFile(archive_path)
        print('Final archive size for %s is %i' % (
            self.current_archive, os.stat(archive_path)[stat.ST_SIZE]))
        return True

    def AddFileToArchive(self, filepath, compress_bit):
        """Add the file at filepath to the current archive.

        Args:
            filepath: the path of the file to add
            compress_bit: the zipfile compression constant to use when adding
                this file

        Returns:
            True if the file could be added (typically because this is a file)
            or False if it couldn't be added (typically because it's a
            directory)
        """
        if not os.path.isfile(filepath):
            return False
        if os.stat(filepath)[stat.ST_SIZE] > 1048576:
            print('Warning: %s is potentially too large to serve on GAE'
                  % filepath)
        archive = self.OpenZipFileAtPath(self._ArchivePath(),
                                         compress=compress_bit)
        try:
            # Member names are the file paths with the base_path prefix
            # stripped.
            archive.write(filepath, filepath[len(self.base_path):])
        finally:
            archive.close()
        return True

    def ArchiveIsValid(self):
        """Check whether the archive is valid.

        Currently this only checks whether the archive is under the required
        size. The thought is that eventually this will do additional
        validation.

        Returns:
            True if the archive is valid, False if it's not
        """
        archive_size = os.stat(self._ArchivePath())[stat.ST_SIZE]
        return archive_size <= self.max_size
# Run the compressor only when this file is executed as a script, handing the
# process argument vector to Main.
if __name__ == '__main__':
    Main(argv=sys.argv)