#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs a statistical hypothesis test on given benchmark results.

Evaluates two benchmark results given as Chart JSON files to determine whether
their difference is statistically significant. This evaluation should be run
using Chart JSON files created by one of the available benchmarks in
tools/perf/run_benchmark.

A "benchmark" (e.g. startup.cold.blank_page) includes several "metrics" (e.g.
first_main_frame_load_time).
"""

from __future__ import print_function

import argparse
import json
import os
import sys

sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '..')))
from statistical_analysis import results_stats


DEFAULT_SIGNIFICANCE_LEVEL = 0.05
DEFAULT_STATISTICAL_TEST = results_stats.MANN


def LoadJsonFromPath(json_path):
  """Returns the parsed JSON from the specified location."""
  with open(os.path.abspath(json_path)) as data_file:
    return json.load(data_file)


def PrintOutcomeLine(name, max_name_length, outcome, print_p_value):
  """Prints a single output line, e.g. 'metric_1 True 0.03'."""
  print('{:{}}{}'.format(name, max_name_length + 2, outcome[0]), end='')
  if print_p_value:
    print('\t{:.10f}'.format(outcome[1]), end='')
  print()


def PrintTestOutcome(test_outcome_dict, test_name, significance_level,
                     print_p_value):
  """Prints the given test outcomes to the command line.

  Prints the p-value for each metric's outcome if |print_p_value| is True, and
  also prints the name of the executed statistical test and the significance
  level.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])

  for metric_name, outcome in test_outcome_dict.iteritems():
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)


def PrintPagesetTestOutcome(test_outcome_dict, test_name, significance_level,
                            print_p_value, print_details):
  """Prints the given test outcomes to the command line.

  Prints a summary combining the p-values of the pageset for each metric. Then
  prints results for each metric/page combination if |print_details| is True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  # Print summarized version at the top.
  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])
  print('Summary (combined p-values for all pages in pageset):\n')
  for metric_name, pageset in test_outcome_dict.iteritems():
    combined_p_value = results_stats.CombinePValues(
        [p[1] for p in pageset.itervalues()])
    outcome = (combined_p_value < significance_level, combined_p_value)
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)
  print()

  if not print_details:
    return

  # Print outcome for every metric/page combination.
  for metric_name, pageset in test_outcome_dict.iteritems():
    max_page_name_len = max([len(page_name) for page_name in pageset])
    print('{}:'.format(metric_name))
    for page_name, page_outcome in pageset.iteritems():
      PrintOutcomeLine(page_name, max_page_name_len, page_outcome,
                       print_p_value)
    print()


def main(args=None):
  """Set up parser and run statistical test on given benchmark results.

  Sets up the command line parser and its arguments, then loads Chart JSONs
  from the given paths, runs the specified statistical hypothesis test on the
  results and prints the test outcomes.
  """
  if args is None:
    args = sys.argv[1:]

  parser = argparse.ArgumentParser(description="""Runs statistical significance
                                   tests on two given Chart JSON benchmark
                                   results produced by the Telemetry
                                   benchmarks.""")

  parser.add_argument(dest='json_paths', nargs=2, help='JSON file location')

  parser.add_argument('--significance', dest='significance_level',
                      default=DEFAULT_SIGNIFICANCE_LEVEL, type=float,
                      help="""The significance level is the type I error rate,
                      which is the probability of concluding that the benchmark
                      results are different although they're not. Default: {},
                      which is common in statistical hypothesis
                      testing.""".format(DEFAULT_SIGNIFICANCE_LEVEL))

  parser.add_argument('--statistical-test', dest='statistical_test',
                      default=DEFAULT_STATISTICAL_TEST,
                      choices=results_stats.ALL_TEST_OPTIONS,
                      help="""Specifies the statistical hypothesis test that is
                      used. Choices are: Mann-Whitney U-test,
                      Kolmogorov-Smirnov, Welch's t-test. Default: Mann-Whitney
                      U-test.""")

  parser.add_argument('-p', action='store_true', dest='print_p_value',
                      help="""If the -p flag is set, the output will include
                      the p-value for each metric.""")

  parser.add_argument('-d', action='store_true', dest='print_details',
                      help="""If the -d flag is set, the output will be more
                      detailed for benchmarks containing pagesets, giving
                      results for every metric/page combination after a
                      summary at the top.""")

  args = parser.parse_args(args)

  result_jsons = [LoadJsonFromPath(json_path) for json_path in args.json_paths]

  if (results_stats.DoesChartJSONContainPageset(result_jsons[0]) and
      results_stats.DoesChartJSONContainPageset(result_jsons[1])):
    # Benchmark containing a pageset.
    result_dict_1, result_dict_2 = (
        [results_stats.CreatePagesetBenchmarkResultDict(result_json)
         for result_json in result_jsons])

    test_outcome_dict = results_stats.ArePagesetBenchmarkResultsDifferent(
        result_dict_1, result_dict_2, args.statistical_test,
        args.significance_level)

    PrintPagesetTestOutcome(test_outcome_dict, args.statistical_test,
                            args.significance_level, args.print_p_value,
                            args.print_details)
  else:
    # Benchmark not containing a pageset.
    # (If only one JSON contains a pageset, results_stats raises an error.)
    result_dict_1, result_dict_2 = (
        [results_stats.CreateBenchmarkResultDict(result_json)
         for result_json in result_jsons])

    test_outcome_dict = (
        results_stats.AreBenchmarkResultsDifferent(result_dict_1,
                                                   result_dict_2,
                                                   args.statistical_test,
                                                   args.significance_level))

    PrintTestOutcome(test_outcome_dict, args.statistical_test,
                     args.significance_level, args.print_p_value)


if __name__ == '__main__':
  sys.exit(main())
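
# Example usage (a sketch; the script and result file names below are
# illustrative assumptions, not defined anywhere in this file). The two
# positional arguments are Chart JSON files produced by two runs of the same
# benchmark via tools/perf/run_benchmark:
#
#   ./compare_benchmark_results.py results_before.json results_after.json -p -d
#
# main() also accepts an explicit argument list instead of reading sys.argv,
# e.g.:
#
#   main(['results_before.json', 'results_after.json',
#         '--significance', '0.01', '-p'])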