# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file contains utility functions for host_history.

import collections
import copy
import multiprocessing.pool
from itertools import groupby

import common
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib.cros.graphite import autotest_es
from autotest_lib.frontend import setup_django_environment
from autotest_lib.frontend.afe import models
from autotest_lib.site_utils import host_label_utils
from autotest_lib.site_utils import job_history


_HOST_HISTORY_TYPE = 'host_history'
_LOCK_HISTORY_TYPE = 'lock_history'

# The maximum number of days that the script will lookup for history.
_MAX_DAYS_FOR_HISTORY = 90


class NoHostFoundException(Exception):
    """Exception raised when no host is found to search for history.
    """


def get_matched_hosts(board, pool):
    """Get duts with matching board and pool labels from metaDB.

    @param board: board of DUT, set to None if board doesn't need to match.
    @param pool: pool of DUT, set to None if pool doesn't need to match.
    @return: A list of duts that match the specified board and pool.
    """
    labels = []
    if pool:
        labels.append('pool:%s' % pool)
    if board:
        labels.append('board:%s' % board)
    host_labels = host_label_utils.get_host_labels(labels=labels)
    return host_labels.keys()


def prepopulate_dict(keys, value, extras=None):
    """Creates a dictionary with val=value for each key.

    @param keys: list of keys
    @param value: the value of each entry in the dict.
    @param extras: list of additional keys
    @returns: dictionary
    """
    result = collections.OrderedDict()
    # Normalize both sequences to lists: concatenating a list with a tuple
    # (or vice versa) raises TypeError, and callers may pass either.
    for key in list(keys) + list(extras or []):
        result[key] = value
    return result


def lock_history_to_intervals(initial_lock_val, t_start, t_end, lock_history):
    """Converts lock history into a list of intervals of locked times.

    @param initial_lock_val: Initial value of the lock (False or True)
    @param t_start: beginning of the time period we are interested in.
    @param t_end: end of the time period we are interested in.
    @param lock_history: Result of querying es for locks (dict)
        This dictionary should contain keys 'locked' and 'time_recorded'
    @returns: Returns a list of tuples where the elements of each tuples
        represent beginning and end of intervals of locked, respectively.
    """
    locked_intervals = []
    t_prev = t_start
    state_prev = initial_lock_val
    for entry in lock_history.hits:
        t_curr = entry['time_recorded']

        # If it is locked, then we put into locked_intervals
        if state_prev:
            locked_intervals.append((t_prev, t_curr))

        # update vars
        t_prev = t_curr
        state_prev = entry['locked']
    if state_prev:
        locked_intervals.append((t_prev, t_end))
    return locked_intervals


def find_most_recent_entry_before(t, type_str, hostname, fields):
    """Returns the fields of the most recent entry before t.

    @param t: time we are interested in.
    @param type_str: _type in esdb, such as 'host_history' (string)
    @param hostname: hostname of DUT (string)
    @param fields: list of fields we are interested in
    @returns: time, field_value of the latest entry.
    """
    # History older than 90 days are ignored. This helps the ES query faster.
    t_epoch = time_utils.to_epoch_time(t)
    result = autotest_es.query(
            fields_returned=fields,
            equality_constraints=[('_type', type_str),
                                  ('hostname', hostname)],
            range_constraints=[('time_recorded',
                                t_epoch-3600*24*_MAX_DAYS_FOR_HISTORY,
                                t_epoch)],
            size=1,
            sort_specs=[{'time_recorded': 'desc'}])
    if result.total > 0:
        return result.hits[0]
    return {}


def get_host_history_intervals(input):
    """Gets stats for a host.

    This method uses intervals found in metaDB to build a full history of the
    host. The intervals argument contains a list of metadata from querying ES
    for records between t_start and t_end. To get the status from t_start to
    the first record logged in ES, we need to look back to the last record
    logged in ES before t_start.

    @param input: A dictionary of input args, which including following args:
            t_start: beginning of time period we are interested in.
            t_end: end of time period we are interested in.
            hostname: hostname for the host we are interested in (string)
            intervals: intervals from ES query.
    @returns: dictionary, num_entries_found
        dictionary of status: time spent in that status
        num_entries_found: number of host history entries
                           found in [t_start, t_end]
    """
    t_start = input['t_start']
    t_end = input['t_end']
    hostname = input['hostname']
    intervals = input['intervals']
    lock_history_recent = find_most_recent_entry_before(
            t=t_start, type_str=_LOCK_HISTORY_TYPE, hostname=hostname,
            fields=['time_recorded', 'locked'])
    # find_most_recent_entry_before returns {} when no entry is found, so
    # both values fall back to None in that case.
    t_lock = lock_history_recent.get('time_recorded', None)
    t_lock_val = lock_history_recent.get('locked', None)
    t_metadata = find_most_recent_entry_before(
            t=t_start, type_str=_HOST_HISTORY_TYPE, hostname=hostname,
            fields=None)
    t_host = t_metadata.pop('time_recorded', None)
    t_host_stat = t_metadata.pop('status', None)
    status_first = t_host_stat if t_host else 'Ready'
    t = min([t for t in [t_lock, t_host, t_start] if t])

    t_epoch = time_utils.to_epoch_time(t)
    t_end_epoch = time_utils.to_epoch_time(t_end)
    lock_history_entries = autotest_es.query(
            fields_returned=['locked', 'time_recorded'],
            equality_constraints=[('_type', _LOCK_HISTORY_TYPE),
                                  ('hostname', hostname)],
            range_constraints=[('time_recorded', t_epoch, t_end_epoch)],
            sort_specs=[{'time_recorded': 'asc'}])

    # Validate lock history. If an unlock event failed to be recorded in
    # metadb, lock history will show the dut being locked while host still has
    # status changed over the time. This check tries to remove the lock event
    # in lock history if:
    # 1. There is only one entry in lock_history_entries (it's a good enough
    #    assumption to avoid the code being over complicated.
    # 2. The host status has changes after the lock history starts as locked.
    if (len(lock_history_entries.hits) == 1 and t_lock_val and
        len(intervals) > 1):
        locked_intervals = None
        print ('Lock history of dut %s is ignored, the dut may have missing '
               'data in lock history in metadb. Try to lock and unlock the dut '
               'in AFE will force the lock history to be updated in metadb.'
               % hostname)
    else:
        locked_intervals = lock_history_to_intervals(t_lock_val, t, t_end,
                                                     lock_history_entries)
    num_entries_found = len(intervals)
    t_prev = t_start
    status_prev = status_first
    metadata_prev = t_metadata
    intervals_of_statuses = collections.OrderedDict()

    for entry in intervals:
        metadata = entry.copy()
        t_curr = metadata.pop('time_recorded')
        status_curr = metadata.pop('status')
        intervals_of_statuses.update(calculate_status_times(
                t_prev, t_curr, status_prev, metadata_prev, locked_intervals))
        # Update vars
        t_prev = t_curr
        status_prev = status_curr
        metadata_prev = metadata

    # Do final as well.
    intervals_of_statuses.update(calculate_status_times(
            t_prev, t_end, status_prev, metadata_prev, locked_intervals))
    return hostname, intervals_of_statuses, num_entries_found


def calculate_total_times(intervals_of_statuses):
    """Calculates total times in each status.

    @param intervals_of_statuses: ordereddict where key=(ti, tf) and val=status
    @returns: dictionary where key=status value=time spent in that status
    """
    total_times = prepopulate_dict(models.Host.Status.names, 0.0,
                                   extras=['Locked'])
    for key, status_info in intervals_of_statuses.iteritems():
        ti, tf = key
        total_times[status_info['status']] += tf - ti
    return total_times


def aggregate_hosts(intervals_of_statuses_list):
    """Aggregates history of multiple hosts

    @param intervals_of_statuses_list: A list of dictionaries where keys
        are tuple (ti, tf), and value is the status along with other metadata.
    @returns: A dictionary where keys are strings, e.g. 'status' and value is
        total time spent in that status among all hosts.
    """
    stats_all = prepopulate_dict(models.Host.Status.names, 0.0,
                                 extras=['Locked'])
    num_hosts = len(intervals_of_statuses_list)
    for intervals_of_statuses in intervals_of_statuses_list:
        total_times = calculate_total_times(intervals_of_statuses)
        for status, delta in total_times.iteritems():
            stats_all[status] += delta
    return stats_all, num_hosts


def get_stats_string_aggregate(labels, t_start, t_end, aggregated_stats,
                               num_hosts):
    """Returns string reporting overall host history for a group of hosts.

    @param labels: A list of labels useful for describing the group
                   of hosts these overall stats represent.
    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param aggregated_stats: A dictionary where keys are string, e.g. 'status'
        value is total time spent in that status among all hosts.
    @param num_hosts: Number of hosts the stats were aggregated over.
    @returns: string representing the aggregate stats report.
    """
    result = 'Overall stats for hosts: %s \n' % (', '.join(labels))
    result += ' %s - %s \n' % (time_utils.epoch_time_to_date_string(t_start),
                               time_utils.epoch_time_to_date_string(t_end))
    result += ' Number of total hosts: %s \n' % (num_hosts)
    # This is multiplied by time_spent to get percentage_spent
    multiplication_factor = 100.0 / ((t_end - t_start) * num_hosts)
    for status, time_spent in aggregated_stats.iteritems():
        # Normalize by the total time we are interested in among ALL hosts.
        spaces = ' ' * (15 - len(status))
        percent_spent = multiplication_factor * time_spent
        result += '    %s: %s %.2f %%\n' % (status, spaces, percent_spent)
    result += '- -- --- ---- ----- ---- --- -- -\n'
    return result


def get_overall_report(label, t_start, t_end, intervals_of_statuses_list):
    """Returns string reporting overall host history for a group of hosts.

    @param label: A string that can be useful for showing what type group
        of hosts these overall stats represent.
    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param intervals_of_statuses_list: A list of dictionaries where keys
        are tuple (ti, tf), and value is the status along with other metadata,
        e.g., task_id, task_name, job_id etc.
    """
    stats_all, num_hosts = aggregate_hosts(
            intervals_of_statuses_list)
    return get_stats_string_aggregate(
            label, t_start, t_end, stats_all, num_hosts)


def get_intervals_for_host(t_start, t_end, hostname):
    """Gets intervals for the given.

    Query metaDB to return all intervals between given start and end time.
    Note that intervals found in metaDB may miss the history from t_start to
    the first interval found.

    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param hostname: hostname of the host to look for history.
    @returns: A list of intervals found for the host.
    """
    t_start_epoch = time_utils.to_epoch_time(t_start)
    t_end_epoch = time_utils.to_epoch_time(t_end)
    host_history_entries = autotest_es.query(
            fields_returned=None,
            equality_constraints=[('_type', _HOST_HISTORY_TYPE),
                                  ('hostname', hostname)],
            range_constraints=[('time_recorded', t_start_epoch, t_end_epoch)],
            sort_specs=[{'time_recorded': 'asc'}])
    return host_history_entries.hits


def get_intervals_for_hosts(t_start, t_end, hosts=None, board=None, pool=None):
    """Gets intervals for given hosts or board/pool.

    Query metaDB to return all intervals between given start and end time.
    If a list of hosts is provided, the board and pool constraints are ignored.
    If hosts is set to None, and board or pool is set, this method will attempt
    to search host history with labels for all hosts, to help the search
    perform faster.
    If hosts, board and pool are all set to None, return intervals for all
    hosts.
    Note that intervals found in metaDB may miss the history from t_start to
    the first interval found.

    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param hosts: A list of hostnames to look for history.
    @param board: Name of the board to look for history. Default is None.
    @param pool: Name of the pool to look for history. Default is None.
    @returns: A dictionary of hostname: intervals.
    """
    hosts_intervals = {}
    if hosts:
        for host in hosts:
            hosts_intervals[host] = get_intervals_for_host(t_start, t_end,
                                                           host)
    else:
        hosts = get_matched_hosts(board, pool)
        if not hosts:
            raise NoHostFoundException('No host is found for board:%s, '
                                       'pool:%s.' % (board, pool))
        equality_constraints = [('_type', _HOST_HISTORY_TYPE),]
        if board:
            equality_constraints.append(('labels', 'board:'+board))
        if pool:
            equality_constraints.append(('labels', 'pool:'+pool))
        t_start_epoch = time_utils.to_epoch_time(t_start)
        t_end_epoch = time_utils.to_epoch_time(t_end)
        results = autotest_es.query(
                equality_constraints=equality_constraints,
                range_constraints=[('time_recorded', t_start_epoch,
                                    t_end_epoch)],
                sort_specs=[{'hostname': 'asc'}])
        results_group_by_host = {}
        for hostname, intervals_for_host in groupby(results.hits,
                                                    lambda h: h['hostname']):
            # Materialize each group immediately: a groupby grouper is a view
            # over the shared iterator and is invalidated as soon as the outer
            # loop advances to the next host. Storing the raw grouper would
            # leave every host but the last with an exhausted iterator (and a
            # grouper object is always truthy, defeating the fallback below).
            results_group_by_host[hostname] = list(intervals_for_host)
        for host in hosts:
            intervals = results_group_by_host.get(host, None)
            # In case the host's board or pool label was modified after
            # the last status change event was reported, we need to run a
            # separate query to get its history. That way the host's
            # history won't be shown as blank.
            if not intervals:
                intervals = get_intervals_for_host(t_start, t_end, host)
            hosts_intervals[host] = intervals
    return hosts_intervals


def get_report(t_start, t_end, hosts=None, board=None, pool=None,
               print_each_interval=False):
    """Gets history for given hosts or board/pool

    If a list of hosts is provided, the board and pool constraints are ignored.

    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param hosts: A list of hostnames to look for history.
    @param board: Name of the board to look for history. Default is None.
    @param pool: Name of the pool to look for history. Default is None.
    @param print_each_interval: True display all intervals, default is False.
    @returns: stats report for this particular host. The report is a list of
              tuples (stat_string, intervals, hostname), intervals is a sorted
              dictionary.
    """
    if hosts:
        board = None
        pool = None

    hosts_intervals = get_intervals_for_hosts(t_start, t_end, hosts, board,
                                              pool)
    history = {}
    # Use a distinct local name so the `pool` (label) parameter is not
    # shadowed by the worker pool.
    thread_pool = multiprocessing.pool.ThreadPool(processes=16)
    try:
        args = []
        for hostname, intervals in hosts_intervals.items():
            args.append({'t_start': t_start,
                         't_end': t_end,
                         'hostname': hostname,
                         'intervals': intervals})
        results = thread_pool.imap_unordered(get_host_history_intervals, args)
        for hostname, intervals, count in results:
            history[hostname] = (intervals, count)
    finally:
        # Always release the worker threads, even if a query raised.
        thread_pool.close()
        thread_pool.join()
    report = []
    for hostname, intervals in history.items():
        total_times = calculate_total_times(intervals[0])
        stats = get_stats_string(
                t_start, t_end, total_times, intervals[0], hostname,
                intervals[1], print_each_interval)
        report.append((stats, intervals[0], hostname))
    return report


def get_report_for_host(t_start, t_end, hostname, print_each_interval):
    """Gets stats report for a host

    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param hostname: hostname for the host we are interested in (string)
    @param print_each_interval: True or False, whether we want to
                                display all intervals
    @returns: stats report for this particular host (string)
    """
    # Search for status change intervals during given time range.
    intervals = get_intervals_for_host(t_start, t_end, hostname)
    num_entries_found = len(intervals)
    # Update the status change intervals with status before the first entry
    # and host's lock history.
    _, intervals_of_statuses = get_host_history_intervals(
            {'t_start': t_start,
             't_end': t_end,
             'hostname': hostname,
             'intervals': intervals})
    total_times = calculate_total_times(intervals_of_statuses)
    return (get_stats_string(
                    t_start, t_end, total_times, intervals_of_statuses,
                    hostname, num_entries_found, print_each_interval),
                    intervals_of_statuses)


def get_stats_string(t_start, t_end, total_times, intervals_of_statuses,
                     hostname, num_entries_found, print_each_interval):
    """Returns string reporting host_history for this host.
    @param t_start: beginning of time period we are interested in.
    @param t_end: end of time period we are interested in.
    @param total_times: dictionary where key=status,
                        value=(time spent in that status)
    @param intervals_of_statuses: dictionary where keys is tuple (ti, tf),
              and value is the status along with other metadata.
    @param hostname: hostname for the host we are interested in (string)
    @param num_entries_found: Number of entries found for the host in es
    @param print_each_interval: boolean, whether to print each interval
    """
    delta = t_end - t_start
    result = 'usage stats for host: %s \n' % (hostname)
    result += ' %s - %s \n' % (time_utils.epoch_time_to_date_string(t_start),
                               time_utils.epoch_time_to_date_string(t_end))
    result += ' Num entries found in this interval: %s\n' % (num_entries_found)
    for status, value in total_times.iteritems():
        spaces = (15 - len(status)) * ' '
        result += '    %s: %s %.2f %%\n' % (status, spaces, 100*value/delta)
    result += '- -- --- ---- ----- ---- --- -- -\n'

    if print_each_interval:
        for interval, status_info in intervals_of_statuses.iteritems():
            t0, t1 = interval
            t0_string = time_utils.epoch_time_to_date_string(t0)
            t1_string = time_utils.epoch_time_to_date_string(t1)
            status = status_info['status']
            delta = int(t1-t0)
            id_info = status_info['metadata'].get(
                    'task_id', status_info['metadata'].get('job_id', ''))
            result += ('    %s  :  %s %-15s %-10s %ss\n' %
                       (t0_string, t1_string, status, id_info, delta))
    return result


def calculate_status_times(t_start, t_end, int_status, metadata,
                           locked_intervals):
    """Returns a list of intervals along w/ statuses associated with them.

    If the dut is in status Ready, i.e., int_status==Ready, the lock history
    should be applied so that the time period when dut is locked is considered
    as not available. Any other status is considered that dut is doing something
    and being used. `Repair Failed` and Repairing are not checked with lock
    status, since these two statuses indicate the dut is not available any way.

    @param t_start: start time
    @param t_end: end time
    @param int_status: status of [t_start, t_end] if not locked
    @param metadata: metadata of the status change, e.g., task_id, task_name.
    @param locked_intervals: list of tuples denoting intervals of locked states
    @returns: dictionary where key = (t_interval_start, t_interval_end),
              val = (status, metadata)
              t_interval_start: beginning of interval for that status
              t_interval_end: end of the interval for that status
              status: string such as 'Repair Failed', 'Locked', etc.
              metadata: A dictionary of metadata, e.g.,
                        {'task_id':123, 'task_name':'Reset'}
    """
    statuses = collections.OrderedDict()

    prev_interval_end = t_start

    # TODO: Put allow more information here in info/locked status
    status_info = {'status': int_status,
                   'metadata': metadata}
    locked_info = {'status': 'Locked',
                   'metadata': {}}
    if not locked_intervals:
        statuses[(t_start, t_end)] = status_info
        return statuses
    for lock_start, lock_end in locked_intervals:
        if prev_interval_end >= t_end:
            break
        if lock_start > t_end:
            # optimization to break early
            # case 0
            # Timeline of status change: t_start t_end
            # Timeline of lock action:                   lock_start lock_end
            break
        elif lock_end < prev_interval_end:
            # case 1
            #                      prev_interval_end    t_end
            # lock_start lock_end
            continue
        elif lock_end <= t_end and lock_start >= prev_interval_end:
            # case 2
            # prev_interval_end                       t_end
            #                   lock_start   lock_end
            # Lock happened in the middle, while the host stays in the same
            # status, consider the lock has no effect on host history.
            statuses[(prev_interval_end, lock_end)] = status_info
            prev_interval_end = lock_end
        elif lock_end > prev_interval_end and lock_start < prev_interval_end:
            # case 3
            #             prev_interval_end          t_end
            # lock_start                    lock_end        (or lock_end)
            # If the host status changed in the middle of being locked,
            # consider the new status change as part of the host history.
            statuses[(prev_interval_end, min(lock_end, t_end))] = locked_info
            prev_interval_end = lock_end
        elif lock_start < t_end and lock_end > t_end:
            # case 4
            # prev_interval_end             t_end
            #                    lock_start        lock_end
            # If the lock happens in the middle of host status change,
            # consider the lock has no effect on the host history for that
            # status.
            statuses[(prev_interval_end, t_end)] = status_info
            statuses[(lock_start, t_end)] = locked_info
            prev_interval_end = t_end
        # Otherwise we are in the case where lock_end < t_start OR
        # lock_start > t_end, which means the lock doesn't apply.
    if t_end > prev_interval_end:
        # This is to avoid logging the same time
        statuses[(prev_interval_end, t_end)] = status_info
    return statuses


def get_log_url(hostname, metadata):
    """Compile a url to job's debug log from debug string.

    @param hostname: Hostname of the dut.
    @param metadata: A dictionary of other metadata, e.g.,
                     {'task_id':123, 'task_name':'Reset'}
    @return: Url of the debug log for special task or job url for test job.
    """
    log_url = None
    if 'task_id' in metadata and 'task_name' in metadata:
        log_url = job_history.TASK_URL % {'hostname': hostname,
                                          'task_id': metadata['task_id'],
                                          'task_name': metadata['task_name']}
    elif 'job_id' in metadata and 'owner' in metadata:
        log_url = job_history.JOB_URL % {'hostname': hostname,
                                         'job_id': metadata['job_id'],
                                         'owner': metadata['owner']}
    return log_url


def build_history(hostname, status_intervals):
    """Get host history information from given state intervals.

    @param hostname: Hostname of the dut.
    @param status_intervals: A ordered dictionary with
                    key as (t_start, t_end) and value as (status, metadata)
                    status = status of the host. e.g. 'Repair Failed'
                    t_start is the beginning of the interval where the DUT's
                            has that status
                    t_end is the end of the interval where the DUT has that
                            status
                    metadata: A dictionary of other metadata, e.g.,
                              {'task_id':123, 'task_name':'Reset'}
    @return: A list of host history, e.g.,
             [{'status': 'Resetting'
               'start_time': '2014-08-07 10:02:16',
               'end_time': '2014-08-07 10:03:16',
               'log_url': 'http://autotest/reset-546546/debug',
               'task_id': 546546},
              {'status': 'Running'
               'start_time': '2014-08-07 10:03:18',
               'end_time': '2014-08-07 10:13:00',
               'log_url': 'http://autotest/afe/#tab_id=view_job&object_id=1683',
               'job_id': 1683}
             ]
    """
    history = []
    for time_interval, status_info in status_intervals.items():
        start_time = time_utils.epoch_time_to_date_string(time_interval[0])
        end_time = time_utils.epoch_time_to_date_string(time_interval[1])
        interval = {'status': status_info['status'],
                    'start_time': start_time,
                    'end_time': end_time}
        interval['log_url'] = get_log_url(hostname, status_info['metadata'])
        interval.update(status_info['metadata'])
        history.append(interval)
    return history


def get_status_intervals(history_details):
    """Get a list of status interval from history details.

    This is a reverse method of above build_history. Caller gets the history
    details from RPC get_host_history, and use this method to get the list of
    status interval, which can be used to calculate stats from
    host_history_utils.aggregate_hosts.

    @param history_details: A dictionary of host history for each host, e.g.,
            {'172.22.33.51': [{'status': 'Resetting'
                               'start_time': '2014-08-07 10:02:16',
                               'end_time': '2014-08-07 10:03:16',
                               'log_url': 'http://autotest/reset-546546/debug',
                               'task_id': 546546},]
            }
    @return: A list of dictionaries where keys are tuple (start_time, end_time),
             and value is a dictionary containing at least key 'status'.
    """
    status_intervals = []
    for host, history in history_details.iteritems():
        intervals = collections.OrderedDict()
        for interval in history:
            start_time = time_utils.to_epoch_time(interval['start_time'])
            end_time = time_utils.to_epoch_time(interval['end_time'])
            metadata = copy.deepcopy(interval)
            metadata['hostname'] = host
            intervals[(start_time, end_time)] = {'status': interval['status'],
                                                 'metadata': metadata}
        status_intervals.append(intervals)
    return status_intervals


def get_machine_utilization_rate(stats):
    """Get machine utilization rate from given stats.

    @param stats: A dictionary with a status as key and value is the total
                  number of seconds spent on the status.
    @return: The percentage of time when dut is running test jobs.
    """
    not_utilized_status = ['Repairing', 'Repair Failed', 'Ready', 'Verifying']
    excluded_status = ['Locked']
    total_time = 0
    total_time_not_utilized = 0.0
    for status, interval in stats.iteritems():
        if status in excluded_status:
            continue
        total_time += interval
        if status in not_utilized_status:
            total_time_not_utilized += interval
    if total_time == 0:
        # All duts are locked, assume MUR is 0%
        return 0
    else:
        return 1 - total_time_not_utilized/total_time


def get_machine_availability_rate(stats):
    """Get machine availability rate from given stats.

    @param stats: A dictionary with a status as key and value is the total
                  number of seconds spent on the status.
    @return: The percentage of time when dut is available to run jobs.
    """
    not_available_status = ['Repairing', 'Repair Failed', 'Verifying']
    excluded_status = ['Locked']
    total_time = 0
    total_time_not_available = 0.0
    for status, interval in stats.iteritems():
        if status in excluded_status:
            continue
        total_time += interval
        if status in not_available_status:
            total_time_not_available += interval
    if total_time == 0:
        # All duts are locked, assume MAR is 0%
        return 0
    else:
        return 1 - total_time_not_available/total_time