普通文本  |  393行  |  16.25 KB

#!/usr/bin/python
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import datetime
import logging
import os
import re
import sys
import time

os.environ['DJANGO_SETTINGS_MODULE'] = 'frontend.settings'

import common
from autotest_lib.server import utils
from django.db import connections, transaction


# Format Appears as: [Date] [Time] - [Msg Level] - [Message]
LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
# This regex makes sure the input is in the format of YYYY-MM-DD (2012-02-01)
DATE_FORMAT_REGEX = ('^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]'
                     '|3[01])$')
SELECT_CMD_FORMAT = """
SELECT %(table)s.%(primary_key)s FROM %(table)s
WHERE %(table)s.%(time_column)s <= "%(date)s"
"""
SELECT_JOIN_CMD_FORMAT = """
SELECT %(table)s.%(primary_key)s FROM %(table)s
INNER JOIN %(related_table)s
  ON %(table)s.%(foreign_key)s=%(related_table)s.%(related_primary_key)s
WHERE %(related_table)s.%(time_column)s <= "%(date)s"
"""
SELECT_WITH_INDIRECTION_FORMAT = """
SELECT %(table)s.%(primary_key)s FROM %(table)s
INNER JOIN %(indirection_table)s
  ON %(table)s.%(foreign_key)s =
     %(indirection_table)s.%(indirection_primary_key)s
INNER JOIN %(related_table)s
  ON %(indirection_table)s.%(indirection_foreign_key)s =
  %(related_table)s.%(related_primary_key)s
WHERE %(related_table)s.%(time_column)s <= "%(date)s"
"""
DELETE_ROWS_FORMAT = """
DELETE FROM %(table)s
WHERE %(table)s.%(primary_key)s IN (%(rows)s)
"""


AFE_JOB_ID = 'afe_job_id'
JOB_ID = 'job_id'
JOB_IDX = 'job_idx'
TEST_IDX = 'test_idx'

# CAUTION: Make sure only the 'default' connection is used. Otherwise
# db_cleanup may delete stuff from the global database, which is generally not
# intended.
cursor = connections['default'].cursor()

# Globals for command line flag constants, for convenience.
DRY_RUN = False
STEP_SIZE = None
LOAD_RATIO = 1.0

class ProgressBar(object):
    TEXT = "{:<40s} [{:<20s}] ({:>9d}/{:>9d})"

    def __init__(self, name, amount):
        self._name = name
        self._amount = amount
        self._cur = 0

    def __enter__(self):
        return self

    def __exit__(self, a, b, c):
        sys.stdout.write('\n')
        sys.stdout.flush()

    def update(self, x):
        """
        Advance the counter by `x`.

        @param x: An integer of how many more elements were processed.
        """
        self._cur += x

    def show(self):
        """
        Display the progress bar on the current line.  Repeated invocations
        "update" the display.
        """
        if self._amount == 0:
            barlen = 20
        else:
            barlen = int(20 * self._cur / float(self._amount))
        if barlen:
            bartext = '=' * (barlen-1) + '>'
        else:
            bartext = ''
        text = self.TEXT.format(self._name, bartext, self._cur, self._amount)
        sys.stdout.write('\r')
        sys.stdout.write(text)
        sys.stdout.flush()


def grouper(iterable, n):
    """
    Group the elements of `iterable` into groups of maximum size `n`.

    @param iterable: An iterable.
    @param n: Max size of returned groups.
    @returns: Yields iterables of size <= n.

    >>> grouper('ABCDEFG', 3)
    [['A', 'B', C'], ['D', 'E', 'F'], ['G']]
    """
    args = [iter(iterable)] * n
    while True:
        lst = []
        try:
            for itr in args:
                lst.append(next(itr))
            yield lst
        except StopIteration:
            if lst:
                yield lst
            break


def _delete_table_data_before_date(table_to_delete_from, primary_key,
                                   related_table, related_primary_key,
                                   date, foreign_key=None,
                                   time_column="started_time",
                                   indirection_table=None,
                                   indirection_primary_key=None,
                                   indirection_foreign_key=None):
    """
    We want a delete statement that will only delete from one table while
    using a related table to find the rows to delete.

    An example mysql command:
    DELETE FROM tko_iteration_result USING tko_iteration_result INNER JOIN
    tko_tests WHERE tko_iteration_result.test_idx=tko_tests.test_idx AND
    tko_tests.started_time <= '2012-02-01';

    There are also tables that require 2 joins to determine which rows we want
    to delete and we determine these rows by joining the table we want to
    delete from with an indirection table to the actual jobs table.

    @param table_to_delete_from: Table whose rows we want to delete.
    @param related_table: Table with the date information we are selecting by.
    @param foreign_key: Foreign key used in table_to_delete_from to reference
                        the related table. If None, the primary_key is used.
    @param primary_key: Primary key in the related table.
    @param date: End date of the information we are trying to delete.
    @param time_column: Column that we want to use to compare the date to.
    @param indirection_table: Table we use to link the data we are trying to
                              delete with the table with the date information.
    @param indirection_primary_key: Key we use to connect the indirection table
                                    to the table we are trying to delete rows
                                    from.
    @param indirection_foreign_key: Key we use to connect the indirection table
                                    to the table with the date information.
    """
    if not foreign_key:
        foreign_key = primary_key

    if not related_table:
        # Deleting from a table directly.
        variables = dict(table=table_to_delete_from, primary_key=primary_key,
                         time_column=time_column, date=date)
        sql = SELECT_CMD_FORMAT % variables
    elif not indirection_table:
        # Deleting using a single JOIN to get the date information.
        variables = dict(primary_key=primary_key, table=table_to_delete_from,
                         foreign_key=foreign_key, related_table=related_table,
                         related_primary_key=related_primary_key,
                         time_column=time_column, date=date)
        sql = SELECT_JOIN_CMD_FORMAT % variables
    else:
        # There are cases where we need to JOIN 3 TABLES to determine the rows
        # we want to delete.
        variables = dict(primary_key=primary_key, table=table_to_delete_from,
                         indirection_table=indirection_table,
                         foreign_key=foreign_key,
                         indirection_primary_key=indirection_primary_key,
                         related_table=related_table,
                         related_primary_key=related_primary_key,
                         indirection_foreign_key=indirection_foreign_key,
                         time_column=time_column, date=date)
        sql = SELECT_WITH_INDIRECTION_FORMAT % variables

    logging.debug('SQL: %s', sql)
    cursor.execute(sql, [])
    rows = [x[0] for x in cursor.fetchall()]
    logging.debug(rows)

    if not rows or rows == [None]:
        with ProgressBar(table_to_delete_from, 0) as pb:
            pb.show()
        logging.debug('Nothing to delete for %s', table_to_delete_from)
        return

    with ProgressBar(table_to_delete_from, len(rows)) as pb:
        for row_keys in grouper(rows, STEP_SIZE):
            variables['rows'] = ','.join([str(x) for x in row_keys])
            sql = DELETE_ROWS_FORMAT % variables

            start = time.time()
            logging.debug('SQL: %s', sql)
            if not DRY_RUN:
                cursor.execute(sql, [])
                transaction.commit_unless_managed(using='default')
            end = time.time()

            pb.update(len(row_keys))
            pb.show()

            if LOAD_RATIO != 1.0:
                assert 0 < LOAD_RATIO <= 1, (
                        'Load ratio must be a fraction between 0 and 1.')
                time.sleep((end - start) / LOAD_RATIO)


def _subtract_days(date, days_to_subtract):
    """
    Return a date (string) that is 'days' before 'date'

    @param date: date (string) we are subtracting from.
    @param days_to_subtract: days (int) we are subtracting.
    """
    date_obj = datetime.datetime.strptime(date, '%Y-%m-%d')
    difference = date_obj - datetime.timedelta(days=days_to_subtract)
    return difference.strftime('%Y-%m-%d')


def _delete_all_data_before_date(date):
    """
    Delete all the database data before a given date.

    This function focuses predominately on the data for jobs in tko_jobs.
    However not all jobs in afe_jobs are also in tko_jobs.

    Therefore we delete all the afe_job and foreign key relations prior to two
    days before date. Then we do the queries using tko_jobs and these
    tables to ensure all the related information is gone. Even though we are
    repeating deletes on these tables, the second delete will be quick and
    completely thorough in ensuring we clean up all the foreign key
    dependencies correctly.

    @param date: End date of the information we are trying to delete.
    @param step: Rows to delete per SQL query.
    """
    # First cleanup all afe_job related data (prior to 2 days before date).
    # The reason for this is not all afe_jobs may be in tko_jobs.
    afe_date = _subtract_days(date, 2)
    logging.info('Cleaning up all afe_job data prior to %s.', afe_date)
    _delete_table_data_before_date('afe_aborted_host_queue_entries',
                                   'queue_entry_id',
                                   'afe_jobs', 'id', afe_date,
                                   time_column= 'created_on',
                                   foreign_key='queue_entry_id',
                                   indirection_table='afe_host_queue_entries',
                                   indirection_primary_key='id',
                                   indirection_foreign_key='job_id')
    _delete_table_data_before_date('afe_special_tasks', 'id',
                                   'afe_jobs', 'id',
                                   afe_date, time_column='created_on',
                                   foreign_key='queue_entry_id',
                                   indirection_table='afe_host_queue_entries',
                                   indirection_primary_key='id',
                                   indirection_foreign_key='job_id')
    _delete_table_data_before_date('afe_host_queue_entries', 'id',
                                   'afe_jobs', 'id',
                                   afe_date, time_column='created_on',
                                   foreign_key=JOB_ID)
    _delete_table_data_before_date('afe_job_keyvals', 'id',
                                   'afe_jobs', 'id',
                                   afe_date, time_column='created_on',
                                   foreign_key=JOB_ID)
    _delete_table_data_before_date('afe_jobs_dependency_labels', 'id',
                                   'afe_jobs', 'id',
                                   afe_date, time_column='created_on',
                                   foreign_key=JOB_ID)
    _delete_table_data_before_date('afe_jobs', 'id',
                                   None, None,
                                   afe_date, time_column='created_on')
    # Special tasks that aren't associated with an HQE
    # Since we don't do the queue_entry_id=NULL check, we might wipe out a bit
    # more than we should, but I doubt anyone will notice or care.
    _delete_table_data_before_date('afe_special_tasks', 'id',
                                   None, None,
                                   afe_date, time_column='time_requested')

    # Now go through and clean up all the rows related to tko_jobs prior to
    # date.
    logging.info('Cleaning up all data related to tko_jobs prior to %s.',
                  date)
    _delete_table_data_before_date('tko_test_attributes', 'id',
                                   'tko_tests', TEST_IDX,
                                   date, foreign_key=TEST_IDX)
    _delete_table_data_before_date('tko_test_labels_tests', 'id',
                                   'tko_tests', TEST_IDX,
                                   date, foreign_key= 'test_id')
    _delete_table_data_before_date('tko_iteration_result', TEST_IDX,
                                   'tko_tests', TEST_IDX,
                                   date)
    _delete_table_data_before_date('tko_iteration_perf_value', TEST_IDX,
                                   'tko_tests', TEST_IDX,
                                   date)
    _delete_table_data_before_date('tko_iteration_attributes', TEST_IDX,
                                   'tko_tests', TEST_IDX,
                                   date)
    _delete_table_data_before_date('tko_job_keyvals', 'id',
                                   'tko_jobs', JOB_IDX,
                                   date, foreign_key='job_id')
    _delete_table_data_before_date('afe_aborted_host_queue_entries',
                                   'queue_entry_id',
                                   'tko_jobs', AFE_JOB_ID, date,
                                   foreign_key='queue_entry_id',
                                   indirection_table='afe_host_queue_entries',
                                   indirection_primary_key='id',
                                   indirection_foreign_key='job_id')
    _delete_table_data_before_date('afe_special_tasks', 'id',
                                   'tko_jobs', AFE_JOB_ID,
                                   date, foreign_key='queue_entry_id',
                                   indirection_table='afe_host_queue_entries',
                                   indirection_primary_key='id',
                                   indirection_foreign_key='job_id')
    _delete_table_data_before_date('afe_host_queue_entries', 'id',
                                   'tko_jobs', AFE_JOB_ID,
                                   date, foreign_key='job_id')
    _delete_table_data_before_date('afe_job_keyvals', 'id',
                                   'tko_jobs', AFE_JOB_ID,
                                   date, foreign_key='job_id')
    _delete_table_data_before_date('afe_jobs_dependency_labels', 'id',
                                   'tko_jobs', AFE_JOB_ID,
                                   date, foreign_key='job_id')
    _delete_table_data_before_date('afe_jobs', 'id',
                                   'tko_jobs', AFE_JOB_ID,
                                   date, foreign_key='id')
    _delete_table_data_before_date('tko_tests', TEST_IDX,
                                   'tko_jobs', JOB_IDX,
                                   date, foreign_key=JOB_IDX)
    _delete_table_data_before_date('tko_jobs', JOB_IDX,
                                   None, None, date)


def parse_args():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print SQL commands and results')
    parser.add_argument('--step', type=int, action='store',
                        default=1000,
                        help='Number of rows to delete at once')
    parser.add_argument('--dry_run', action='store_true',
                        help='Print SQL queries instead of executing them.')
    parser.add_argument('--load_ratio', type=float, action='store', default=0.2,
                        help=('The fraction of time the script should be '
                              'performing deletes. For example --load_ratio=.2 '
                              'will cause the script to sleep 80% of the time, '
                              'and perform work for the other 20%.'))
    parser.add_argument('date', help='Keep results newer than')
    return parser.parse_args()


def main():
    args = parse_args()

    verbose = args.verbose or args.dry_run
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=level, format=LOGGING_FORMAT)
    logging.info('Calling: %s', sys.argv)

    if not re.match(DATE_FORMAT_REGEX, args.date):
        print 'DATE must be in yyyy-mm-dd format!'
        return

    global STEP_SIZE, DRY_RUN, LOAD_RATIO
    STEP_SIZE = args.step
    DRY_RUN = args.dry_run
    LOAD_RATIO = args.load_ratio

    _delete_all_data_before_date(args.date)


if __name__ == '__main__':
    main()