#!/usr/bin/python2.4
#
# Copyright (C) 2008 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Module to compress directories in to series of zip files.
This module will take a directory and compress all its contents, including
child directories into a series of zip files named N.zip where 'N' ranges from
0 to infinity. The zip files will all be below a certain specified maximum
threshold.
The directory is compressed with a depth first traversal, each directory's
file contents being compressed as it is visited, before the compression of any
child directory's contents. In this way the files within an archive are ordered
and the archives themselves are ordered.
The class also constructs a 'main.py' file intended for use with Google App
Engine with a custom App Engine program not currently distributed with this
code base. The custom App Engine runtime can leverage the index files written
out by this class to more quickly locate which zip file to serve a given URL
from.
"""
__author__ = 'jmatt@google.com (Justin Mattson)'
import optparse
import os
import stat
import sys
import zipfile
import divide_and_compress_constants
def CreateOptionsParser():
    """Builds the command line parser used by this tool.

    The parser understands four options: -s/--sourcefiles, -d/--destination,
    -f/--filesize (defaults to '1M') and -n/--nocompress (which stores False
    into 'compress'; the default is True).

    Returns:
      A configured optparse.OptionParser object.
    """
    destination_help = ''.join([
        'Where to put the archive files, this should not be',
        ' a child of where the source files exist.',
    ])
    filesize_help = ''.join([
        'Maximum size of archive files. A number followed by ',
        'a magnitude indicator either "B", "K", "M", or "G". ',
        'Examples:\n 1000000B == one million BYTES\n',
        ' 1.2M == one point two MEGABYTES\n',
        ' 1M == 1048576 BYTES',
    ])
    nocompress_help = ''.join([
        'Whether the archive files should be compressed, or ',
        'just a concatenation of the source files',
    ])

    # (flags, keyword settings) pairs, registered in this order so that the
    # generated help text lists the options in the same sequence.
    option_table = (
        (('-s', '--sourcefiles'),
         {'dest': 'sourcefiles',
          'default': None,
          'help': 'The directory containing the files to compress'}),
        (('-d', '--destination'),
         {'dest': 'destination',
          'default': None,
          'help': destination_help}),
        (('-f', '--filesize'),
         {'dest': 'filesize',
          'default': '1M',
          'help': filesize_help}),
        (('-n', '--nocompress'),
         {'action': 'store_false',
          'dest': 'compress',
          'default': True,
          'help': nocompress_help}),
    )

    parser = optparse.OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    return parser
def VerifyArguments(options, parser):
    """Checks that the required command line arguments were supplied.

    Both 'sourcefiles' and 'destination' must be present on options and must
    not be None. Otherwise the parser's help text is printed and the process
    exits with status -1.

    Args:
      options: The command line options passed.
      parser: The parser object used to parse the command string.
    """
    # A missing attribute is treated exactly like an option left unset.
    source = getattr(options, 'sourcefiles', None)
    destination = getattr(options, 'destination', None)
    if source is not None and destination is not None:
        return
    parser.print_help()
    sys.exit(-1)
def ParseSize(size_str):
    """Parse the file size argument from a string to a number of bytes.

    The string is a number (integer or decimal) followed by exactly one
    magnitude indicator: "B" (bytes), "K" (x 1024), "M" (x 1048576) or
    "G" (x 1073741824). The result is truncated to an integer.

    Args:
      size_str: The string representation of the file size, e.g. '1.2M'.

    Returns:
      The file size in bytes, as an integer.

    Raises:
      ValueError: Raises an error if the numeric or qualifier portions of the
        file size argument is invalid. This includes negative values and
        values that are not finite numbers.
    """
    if len(size_str) < 2:
        raise ValueError(('filesize argument not understood, please include'
                          ' a numeric value and magnitude indicator'))

    multipliers = {'B': 1, 'K': 1024, 'M': 1048576, 'G': 1073741824}
    magnitude = size_str[-1]
    if magnitude not in multipliers:
        raise ValueError(('filesize magnitude indicator not valid, must be "B",'
                          '"K","M", or "G"'))

    # float() raises ValueError itself when the numeric part is malformed.
    numeral = float(size_str[:-1]) * multipliers[magnitude]
    if numeral < 0:
        raise ValueError('filesize must not be negative')

    try:
        return int(numeral)
    except OverflowError:
        # int() rejects infinities with OverflowError; report them with the
        # documented exception type instead.
        raise ValueError('filesize numeric value is not a finite number')
class DirectoryZipper(object):
    """Class to compress a directory and all its sub-directories.

    Files are packed into archives named '0.zip', '1.zip', ... inside the
    output directory. A new archive is started whenever adding a file pushes
    the current one over the maximum size. An index file named 'main.py'
    records, for every archive, the first file stored in it.
    """

    def __init__(self, output_path, base_dir, archive_size, enable_compression):
        """DirectoryZipper constructor.

        Args:
          output_path: A string, the path to write the archives and index file
            to.
          base_dir: A string, the directory to compress.
          archive_size: A number, the maximum size, in bytes, of a single
            archive file.
          enable_compression: A boolean, whether or not compression should be
            enabled, if disabled, the files will be written into an
            uncompressed zip.
        """
        self.output_dir = output_path
        self.current_archive = '0.zip'
        self.base_path = base_dir
        self.max_size = archive_size
        self.compress = enable_compression

        # Set index_fp to None, because we don't know what it will be yet.
        self.index_fp = None

    def StartCompress(self):
        """Start compress of the directory.

        This will start the compression process and write the archives to the
        specified output directory. It will also produce a 'main.py' file in
        the output directory: the constants module's file_preamble, one index
        record per archive mapping it to its first file, then file_endpiece.
        """
        self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
        try:
            self.index_fp.write(divide_and_compress_constants.file_preamble)
            # os.walk visits a directory before its children. Sorting the
            # sub-directory names in place makes it descend in alphabetical
            # order.
            for dir_path, dir_names, file_names in os.walk(self.base_path):
                dir_names.sort()
                self.CompressDirectory(1, dir_path, dir_names + file_names)
            self.index_fp.write(divide_and_compress_constants.file_endpiece)
        finally:
            # Close the index even if compression fails part way through.
            self.index_fp.close()

    def RemoveLastFile(self, archive_path=None):
        """Removes the last item in the archive.

        This removes the last item in the archive by reading the items out of
        the archive, adding them to a new archive, deleting the old archive,
        and moving the new archive to the location of the old archive.

        Args:
          archive_path: Path to the archive to modify. This archive should not
            be open elsewhere, since it will need to be deleted. Defaults to
            the current archive in the output directory.

        Returns:
          None. The rewritten archive is closed and left at archive_path.
        """
        if archive_path is None:
            archive_path = os.path.join(self.output_dir, self.current_archive)

        # Move the old file and create a new one at its old location.
        root, ext = os.path.splitext(archive_path)
        old_archive = ''.join([root, '-old', ext])
        os.rename(archive_path, old_archive)

        # By default, store uncompressed.
        compress_bit = zipfile.ZIP_STORED
        if self.compress:
            compress_bit = zipfile.ZIP_DEFLATED

        old_fp = self.OpenZipFileAtPath(old_archive, mode='r')
        try:
            new_fp = self.OpenZipFileAtPath(archive_path,
                                            mode='w',
                                            compress=compress_bit)
            try:
                # Copy every member except the last one into the new archive.
                for zip_member in old_fp.infolist()[:-1]:
                    new_fp.writestr(zip_member,
                                    old_fp.read(zip_member.filename))
            finally:
                new_fp.close()
        finally:
            old_fp.close()

        os.unlink(old_archive)

    def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
        """Opens the zip file at path.

        This method is mainly for testing purposes, eg dependency injection.

        Args:
          path: A string, the path of the zip file to open.
          mode: The zipfile mode, 'r', 'w' or 'a'. When None, 'a' is used if
            path already exists and 'w' otherwise.
          compress: The zipfile compression constant to use when the archive
            is opened for writing. It is ignored for mode 'r'.

        Returns:
          A zipfile.ZipFile object.
        """
        if mode is None:
            if os.path.exists(path):
                mode = 'a'
            else:
                mode = 'w'

        if mode == 'r':
            return zipfile.ZipFile(path, mode)
        else:
            return zipfile.ZipFile(path, mode, compress)

    def CompressDirectory(self, unused_id, dir_path, dir_contents):
        """Method to compress the given directory.

        This method compresses the regular files directly inside 'dir_path'.
        It will add to an existing zip file that still has space and create
        new ones as necessary to keep zip file sizes under the maximum
        specified size. This also writes out the mapping of files to archives
        to the self.index_fp file descriptor. Entries that are not files, and
        files that are symbolic links, are skipped.

        Args:
          unused_id: An opaque value supplied by the directory walker, this
            is not used by this method.
          dir_path: A string, the path to the directory to compress.
          dir_contents: A list of directory contents to be compressed. It is
            sorted in place so that files are archived alphabetically.
        """
        dir_contents.sort()

        compress_bit = zipfile.ZIP_DEFLATED
        if not self.compress:
            compress_bit = zipfile.ZIP_STORED

        # Zip all files in this directory, adding to existing archives and
        # creating as necessary.
        for filename in dir_contents:
            target_file = os.path.join(dir_path, filename)
            if os.path.isfile(target_file):
                self._ArchiveFile(target_file, compress_bit)

    def _ArchiveFile(self, target_file, compress_bit):
        """Stores one file, rolling over to a new archive when needed."""
        while True:
            if not self.AddFileToArchive(target_file, compress_bit):
                # Nothing was written (e.g. the file is a symlink), so there
                # is no archive change to validate or index.
                return

            if self.ArchiveIsValid():
                # Write an index record if necessary.
                self.WriteIndexRecord()
                return

            # Adding the file made the archive too large.
            # IF fixing fails, the last added file was too large, skip it
            # ELSE the current archive filled normally, make a new one and
            # try adding the file again.
            if not self.FixArchive('SIZE'):
                return
            self._AdvanceArchive()

    def _AdvanceArchive(self):
        """Points current_archive at the next numbered archive name."""
        number = int(
            self.current_archive[0:self.current_archive.rfind('.zip')])
        self.current_archive = '%i.zip' % (number + 1)

    def WriteIndexRecord(self):
        """Write an index record to the index file.

        Only write an index record if this is the first file to go into
        archive, i.e. the current archive holds exactly one member.

        Returns:
          True if an archive record is written, False if it isn't.
        """
        archive = self.OpenZipFileAtPath(
            os.path.join(self.output_dir, self.current_archive), 'r')
        try:
            archive_index = archive.infolist()
        finally:
            archive.close()

        if len(archive_index) != 1:
            return False

        self.index_fp.write(
            '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                     archive_index[0].filename))
        return True

    def FixArchive(self, problem):
        """Make the archive compliant.

        For the 'SIZE' problem: an archive holding a single file is deleted,
        because that file alone exceeds the limit. Otherwise the last file
        added is removed from the archive.

        Args:
          problem: A string naming the reason the archive is invalid. Only
            'SIZE' is handled.

        Returns:
          Whether the file(s) removed to fix the archive could conceivably be
          in an archive, but for some reason can't be added to this one.
          None is returned for any problem other than 'SIZE'.
        """
        if problem != 'SIZE':
            return None

        archive_path = os.path.join(self.output_dir, self.current_archive)
        archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
        try:
            members = archive_obj.infolist()
        finally:
            archive_obj.close()

        # IF there is a single file, that means its too large to compress,
        # delete the created archive
        # ELSE do normal finalization.
        if len(members) == 1:
            print('WARNING: %s%s is too large to store.' % (
                self.base_path, members[0].filename))
            os.unlink(archive_path)
            return False

        self.RemoveLastFile(archive_path)
        print('Final archive size for %s is %i' % (
            self.current_archive, os.path.getsize(archive_path)))
        return True

    def AddFileToArchive(self, filepath, compress_bit):
        """Add the file at filepath to the current archive.

        The member name inside the archive is filepath with the leading
        base_path characters removed. A warning is printed for files larger
        than 1048576 bytes.

        Args:
          filepath: A string, the path of the file to add.
          compress_bit: The zipfile compression constant (ZIP_STORED or
            ZIP_DEFLATED) to open the archive with.

        Returns:
          True if the file could be added (typically because this is a file)
          or False if it couldn't be added (typically because its a directory
          or a symbolic link).
        """
        if not os.path.isfile(filepath) or os.path.islink(filepath):
            return False

        if os.path.getsize(filepath) > 1048576:
            print('Warning: %s is potentially too large to serve on GAE'
                  % filepath)

        curr_archive_path = os.path.join(self.output_dir, self.current_archive)
        archive = self.OpenZipFileAtPath(curr_archive_path,
                                         compress=compress_bit)
        try:
            # Add the file to the archive.
            archive.write(filepath, filepath[len(self.base_path):])
        finally:
            archive.close()
        return True

    def ArchiveIsValid(self):
        """Check whether the archive is valid.

        Currently this only checks whether the archive is under the required
        size. The thought is that eventually this will do additional
        validation.

        Returns:
          True if the archive is valid, False if its not.
        """
        archive_path = os.path.join(self.output_dir, self.current_archive)
        return os.path.getsize(archive_path) <= self.max_size
def main(argv):
    """Command line entry point: compress a directory into sized archives.

    Parses the arguments, verifies that the required ones are present, then
    builds a DirectoryZipper from them and runs it.

    Args:
      argv: The full argument list; the first element (the program name) is
        ignored.
    """
    option_parser = CreateOptionsParser()
    parsed = option_parser.parse_args(args=argv[1:])
    options = parsed[0]
    VerifyArguments(options, option_parser)

    max_archive_bytes = ParseSize(options.filesize)
    directory_zipper = DirectoryZipper(options.destination,
                                       options.sourcefiles,
                                       max_archive_bytes,
                                       options.compress)
    directory_zipper.StartCompress()


if __name__ == '__main__':
    main(sys.argv)