# -*- coding: utf-8 -*-
# Copyright 2014 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of hash command for calculating hashes of local files."""

from hashlib import md5
import os

import crcmod

from gslib.command import Command
from gslib.command_argument import CommandArgument
from gslib.cs_api_map import ApiSelector
from gslib.exception import CommandException
from gslib.hashing_helper import Base64EncodeHash
from gslib.hashing_helper import CalculateHashesFromContents
from gslib.hashing_helper import SLOW_CRCMOD_WARNING
from gslib.progress_callback import ConstructAnnounceText
from gslib.progress_callback import FileProgressCallbackHandler
from gslib.progress_callback import ProgressCallbackWithBackoff
from gslib.storage_url import StorageUrlFromString
from gslib.util import NO_MAX
from gslib.util import UsingCrcmodExtension

# The -c/-h/-m flags are sub-options of the hash command (see
# supported_sub_args below), so they are listed after the command name.
_SYNOPSIS = """
  gsutil hash [-c] [-h] [-m] filename...
"""

_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """


<B>DESCRIPTION</B>
  The hash command calculates hashes on a local file that can be used to compare
  with gsutil ls -L output. If a specific hash option is not provided, this
  command calculates all gsutil-supported hashes for the file.

  Note that gsutil automatically performs hash validation when uploading or
  downloading files, so this command is only needed if you want to write a
  script that separately checks the hash for some reason.

  If you calculate a CRC32c hash for the file without a precompiled crcmod
  installation, hashing will be very slow. See "gsutil help crcmod" for details.

<B>OPTIONS</B>
  -c          Calculate a CRC32c hash for the file.

  -h          Output hashes in hex format. By default, gsutil uses base64.

  -m          Calculate a MD5 hash for the file.
""")


class HashCommand(Command):
  """Implementation of gsutil hash command."""

  # Command specification. See base class for documentation.
  command_spec = Command.CreateCommandSpec(
      'hash',
      command_name_aliases=[],
      usage_synopsis=_SYNOPSIS,
      min_args=1,
      max_args=NO_MAX,
      supported_sub_args='chm',
      file_url_ok=True,
      provider_url_ok=False,
      urls_start_arg=0,
      gs_api_support=[ApiSelector.JSON],
      gs_default_api=ApiSelector.JSON,
      argparse_arguments=[
          CommandArgument.MakeZeroOrMoreFileURLsArgument()
      ]
  )
  # Help specification. See help_provider.py for documentation.
  help_spec = Command.HelpSpec(
      help_name='hash',
      help_name_aliases=['checksum'],
      help_type='command_help',
      help_one_line_summary='Calculate file hashes',
      help_text=_DETAILED_HELP_TEXT,
      subcommand_help_text={},
  )

  @classmethod
  def _ParseOpts(cls, sub_opts, logger):
    """Returns behavior variables based on input options.

    If neither -c nor -m is given, both hashes are selected. A warning is
    logged when CRC32c is selected but the compiled crcmod extension is not
    in use.

    Args:
      sub_opts: getopt sub-arguments for the command.
      logger: logging.Logger for the command.

    Returns:
      Tuple of
      calc_crc32c: Boolean, if True, command should calculate a CRC32c checksum.
      calc_md5: Boolean, if True, command should calculate an MD5 hash.
      format_func: Function used for formatting the hash in the desired format.
      output_format: String describing the hash output format.
    """
    calc_crc32c = False
    calc_md5 = False
    # Default output is base64; -h switches both the formatter and the label.
    format_func = lambda digest: Base64EncodeHash(digest.hexdigest())
    found_hash_option = False
    output_format = 'base64'

    if sub_opts:
      for o, unused_a in sub_opts:
        if o == '-c':
          calc_crc32c = True
          found_hash_option = True
        elif o == '-h':
          output_format = 'hex'
          format_func = lambda digest: digest.hexdigest()
        elif o == '-m':
          calc_md5 = True
          found_hash_option = True

    # No explicit hash selection means "compute every supported hash".
    if not found_hash_option:
      calc_crc32c = True
      calc_md5 = True

    if calc_crc32c and not UsingCrcmodExtension(crcmod):
      logger.warning(SLOW_CRCMOD_WARNING)

    return calc_crc32c, calc_md5, format_func, output_format

  def _GetHashClassesFromArgs(self, calc_crc32c, calc_md5):
    """Constructs the dictionary of hashes to compute based on the arguments.

    Args:
      calc_crc32c: If True, CRC32c should be included.
      calc_md5: If True, MD5 should be included.

    Returns:
      Dictionary of {string: hash digester}, where string is the name of the
      digester algorithm.
    """
    hash_dict = {}
    if calc_crc32c:
      hash_dict['crc32c'] = crcmod.predefined.Crc('crc-32c')
    if calc_md5:
      hash_dict['md5'] = md5()
    return hash_dict

  def RunCommand(self):
    """Command entry point for the hash command.

    Hashes every local file matched by the command arguments and prints the
    requested digests for each one.

    Returns:
      0 once all arguments have been processed.

    Raises:
      CommandException: if an argument is not a file URL, or if no file
          matched any of the arguments.
    """
    (calc_crc32c, calc_md5, format_func, output_format) = (
        self._ParseOpts(self.sub_opts, self.logger))

    matched_one = False
    for url_str in self.args:
      if not StorageUrlFromString(url_str).IsFileUrl():
        raise CommandException('"hash" command requires a file URL')

      for file_ref in self.WildcardIterator(url_str).IterObjects():
        matched_one = True
        file_name = file_ref.storage_url.object_name
        file_size = os.path.getsize(file_name)
        callback_processor = ProgressCallbackWithBackoff(
            file_size,
            FileProgressCallbackHandler(
                ConstructAnnounceText('Hashing', file_name),
                self.logger).call)
        # Fresh digester objects per file so state never carries over.
        hash_dict = self._GetHashClassesFromArgs(calc_crc32c, calc_md5)
        with open(file_name, 'rb') as fp:
          CalculateHashesFromContents(fp, hash_dict,
                                      callback_processor=callback_processor)
        # A single parenthesized argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('Hashes [%s] for %s:' % (output_format, file_name))
        for name, digest in hash_dict.items():
          print('\tHash (%s):\t\t%s' % (name, format_func(digest)))

    if not matched_one:
      raise CommandException('No files matched')

    return 0