# -*- coding: utf-8 -*-
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of Unix-like du command for cloud storage providers."""
from __future__ import absolute_import
import sys
from gslib.boto_translation import S3_DELETE_MARKER_GUID
from gslib.bucket_listing_ref import BucketListingObject
from gslib.command import Command
from gslib.command_argument import CommandArgument
from gslib.cs_api_map import ApiSelector
from gslib.exception import CommandException
from gslib.ls_helper import LsHelper
from gslib.storage_url import ContainsWildcard
from gslib.storage_url import StorageUrlFromString
from gslib.util import MakeHumanReadable
from gslib.util import NO_MAX
from gslib.util import UTF8
_SYNOPSIS = """
gsutil du url...
"""
_DETAILED_HELP_TEXT = ("""
<B>SYNOPSIS</B>
""" + _SYNOPSIS + """
<B>DESCRIPTION</B>
The du command displays the amount of space (in bytes) being used by the
objects in the file or object hierarchy under a given URL. The syntax emulates
the Linux du command (which stands for disk usage). For example, the command:
gsutil du -s gs://your-bucket/dir
will report the total space used by all objects under gs://your-bucket/dir and
any sub-directories.
<B>OPTIONS</B>
-0 Ends each output line with a 0 byte rather than a newline. This
can be useful to make the output more easily machine-readable.
-a Includes non-current object versions / generations in the listing
(only useful with a versioning-enabled bucket). Also prints
generation and metageneration for each listed object.
-c Produce a grand total.
-e A pattern to exclude from reporting. Example: -e "*.o" would
exclude any object that ends in ".o". Can be specified multiple
times.
-h Prints object sizes in human-readable format (e.g., 1 KiB,
234 MiB, 2GiB, etc.)
-s Display only a summary total for each argument.
-X Similar to -e, but excludes patterns from the given file. The
patterns to exclude should be one per line.
<B>EXAMPLES</B>
To list the size of all objects in a bucket:
gsutil du gs://bucketname
To list the size of all objects underneath a prefix:
gsutil du gs://bucketname/prefix/*
To print the total number of bytes in a bucket, in human-readable form:
gsutil du -ch gs://bucketname
To see a summary of the total bytes in the two given buckets:
gsutil du -s gs://bucket1 gs://bucket2
To list the size of all objects in a versioned bucket, including objects that
are not the latest:
gsutil du -a gs://bucketname
To list all objects in a bucket, except objects that end in ".bak",
with each object printed ending in a null byte:
gsutil du -e "*.bak" -0 gs://bucketname
To get a total of all buckets in a project with a grand total for an entire
project:
gsutil -o GSUtil:default_project_id=project-name du -shc
""")
class DuCommand(Command):
"""Implementation of gsutil du command."""
# Command specification. See base class for documentation.
command_spec = Command.CreateCommandSpec(
'du',
command_name_aliases=[],
usage_synopsis=_SYNOPSIS,
min_args=0,
max_args=NO_MAX,
supported_sub_args='0ace:hsX:',
file_url_ok=False,
provider_url_ok=True,
urls_start_arg=0,
gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
gs_default_api=ApiSelector.JSON,
argparse_arguments=[
CommandArgument.MakeZeroOrMoreCloudURLsArgument()
]
)
# Help specification. See help_provider.py for documentation.
help_spec = Command.HelpSpec(
help_name='du',
help_name_aliases=[],
help_type='command_help',
help_one_line_summary='Display object size usage',
help_text=_DETAILED_HELP_TEXT,
subcommand_help_text={},
)
def _PrintSummaryLine(self, num_bytes, name):
size_string = (MakeHumanReadable(num_bytes)
if self.human_readable else str(num_bytes))
sys.stdout.write('%(size)-10s %(name)s%(ending)s' % {
'size': size_string, 'name': name, 'ending': self.line_ending})
def _PrintInfoAboutBucketListingRef(self, bucket_listing_ref):
"""Print listing info for given bucket_listing_ref.
Args:
bucket_listing_ref: BucketListing being listed.
Returns:
Tuple (number of objects, object size)
Raises:
Exception: if calling bug encountered.
"""
obj = bucket_listing_ref.root_object
url_str = bucket_listing_ref.url_string
if (obj.metadata and S3_DELETE_MARKER_GUID in
obj.metadata.additionalProperties):
size_string = '0'
num_bytes = 0
num_objs = 0
url_str += '<DeleteMarker>'
else:
size_string = (MakeHumanReadable(obj.size)
if self.human_readable else str(obj.size))
num_bytes = obj.size
num_objs = 1
if not self.summary_only:
sys.stdout.write('%(size)-10s %(url)s%(ending)s' % {
'size': size_string,
'url': url_str.encode(UTF8),
'ending': self.line_ending})
return (num_objs, num_bytes)
def RunCommand(self):
"""Command entry point for the du command."""
self.line_ending = '\n'
self.all_versions = False
self.produce_total = False
self.human_readable = False
self.summary_only = False
self.exclude_patterns = []
if self.sub_opts:
for o, a in self.sub_opts:
if o == '-0':
self.line_ending = '\0'
elif o == '-a':
self.all_versions = True
elif o == '-c':
self.produce_total = True
elif o == '-e':
self.exclude_patterns.append(a)
elif o == '-h':
self.human_readable = True
elif o == '-s':
self.summary_only = True
elif o == '-X':
if a == '-':
f = sys.stdin
else:
f = open(a, 'r')
try:
for line in f:
line = line.strip()
if line:
self.exclude_patterns.append(line)
finally:
f.close()
if not self.args:
# Default to listing all gs buckets.
self.args = ['gs://']
total_bytes = 0
got_nomatch_errors = False
def _PrintObjectLong(blr):
return self._PrintInfoAboutBucketListingRef(blr)
def _PrintNothing(unused_blr=None):
pass
def _PrintDirectory(num_bytes, name):
if not self.summary_only:
self._PrintSummaryLine(num_bytes, name)
for url_arg in self.args:
top_level_storage_url = StorageUrlFromString(url_arg)
if top_level_storage_url.IsFileUrl():
raise CommandException('Only cloud URLs are supported for %s'
% self.command_name)
bucket_listing_fields = ['size']
ls_helper = LsHelper(
self.WildcardIterator, self.logger,
print_object_func=_PrintObjectLong, print_dir_func=_PrintNothing,
print_dir_header_func=_PrintNothing,
print_dir_summary_func=_PrintDirectory,
print_newline_func=_PrintNothing, all_versions=self.all_versions,
should_recurse=True, exclude_patterns=self.exclude_patterns,
fields=bucket_listing_fields)
# ls_helper expands to objects and prefixes, so perform a top-level
# expansion first.
if top_level_storage_url.IsProvider():
# Provider URL: use bucket wildcard to iterate over all buckets.
top_level_iter = self.WildcardIterator(
'%s://*' % top_level_storage_url.scheme).IterBuckets(
bucket_fields=['id'])
elif top_level_storage_url.IsBucket():
top_level_iter = self.WildcardIterator(
'%s://%s' % (top_level_storage_url.scheme,
top_level_storage_url.bucket_name)).IterBuckets(
bucket_fields=['id'])
else:
top_level_iter = [BucketListingObject(top_level_storage_url)]
for blr in top_level_iter:
storage_url = blr.storage_url
if storage_url.IsBucket() and self.summary_only:
storage_url = StorageUrlFromString(
storage_url.CreatePrefixUrl(wildcard_suffix='**'))
_, exp_objs, exp_bytes = ls_helper.ExpandUrlAndPrint(storage_url)
if (storage_url.IsObject() and exp_objs == 0 and
ContainsWildcard(url_arg) and not self.exclude_patterns):
got_nomatch_errors = True
total_bytes += exp_bytes
if self.summary_only:
self._PrintSummaryLine(exp_bytes, blr.url_string.rstrip('/'))
if self.produce_total:
self._PrintSummaryLine(total_bytes, 'total')
if got_nomatch_errors:
raise CommandException('One or more URLs matched no objects.')
return 0