# Source code for rwskit.benchmarking

"""
Benchmarking tools.
"""

from __future__ import annotations

# Python Modules
import logging
import time

from inspect import signature
from itertools import product
from typing import Any, Callable, Iterable, Literal, Optional, TypeVar, cast, get_args

from rwskit.collections_ import is_generator, is_iterable

# 3rd Party Modules
import numpy as np
import pandas as pd
import plotnine as pn

from icontract import require
from numpy.typing import ArrayLike
from scipy.stats import ttest_ind
from tqdm.auto import (
    tqdm,
)  # This works better in Jupyter notebooks than 'from tqdm import tqdm'

# Project Modules

log = logging.getLogger(__name__)


# Public API of this module: the names bound by ``from rwskit.benchmarking import *``.
__all__ = [
    "TimeUnit",
    "AggregationFunctionName",
    "AggregationFunction",
    "BenchmarkSortValue",
    "BenchmarkResult",
    "BenchmarkRunner",
    "change_time_unit",
    "get_time_unit_abbreviation",
    "validate_call_signature",
]

# Scalar value types; used below to annotate the values of a ``benchmark_space``.
T = TypeVar("T", bool, int, float, str)
# Unconstrained element type; used to annotate iterables wrapped in a progress bar.
I = TypeVar("I")

TimeUnit = Literal[
    "seconds",
    "s",
    "milliseconds",
    "ms",
    "microseconds",
    "us",
    "µs",
    "nanoseconds",
    "ns",
]
""" The supported units of time. """

AggregationFunctionName = Literal["min", "max", "mean", "median", "sum"]
""" The names of the supported aggregation functions. """

AggregationFunction = Callable[[ArrayLike], float]
""" An aggregation function is a callable that takes an array-like object and returns a single float. """

BenchmarkSortValue = Literal["min", "mean", "function"]
""" The supported values for sorting a :class:`BenchmarkResult` when represented as a string. """

# Names of the plotnine themes that may be passed to ``BenchmarkResult.plot``.
PlotTheme = Literal[
    "theme_538",
    "theme_bw",
    "theme_classic",
    "theme_dark",
    "theme_gray",
    "theme_grey",
    "theme_light",
    "theme_linedraw",
    "theme_matplotlib",
    "theme_minimal",
    "theme_seaborn",
    "theme_tufte",
    "theme_void",
    "theme_xkcd",
]

# Multiplier that converts a value expressed in the keyed unit into seconds.
# Only full unit names are keys here; abbreviations must be normalized first.
_time_conversion_factors = {
    "seconds": 1,
    "milliseconds": 1e-3,
    "microseconds": 1e-6,
    "nanoseconds": 1e-9,
}

# Maps every accepted spelling of a unit (abbreviation or full name) to the
# unit's full name. Full names map to themselves.
_time_unit_abbreviation_to_name: dict[TimeUnit, TimeUnit] = {
    "s": "seconds",
    "ms": "milliseconds",
    "us": "microseconds",
    "µs": "microseconds",
    "ns": "nanoseconds",
    "seconds": "seconds",
    "milliseconds": "milliseconds",
    "microseconds": "microseconds",
    "nanoseconds": "nanoseconds",
}

# Maps a full unit name to the abbreviation used for display (e.g. axis labels).
_time_unit_name_to_abbreviation = {
    "seconds": "s",
    "milliseconds": "ms",
    "microseconds": "µs",
    "nanoseconds": "ns",
}

# The plotnine theme classes that can be selected by name.
_valid_themes = {
    pn.theme_538,
    pn.theme_bw,
    pn.theme_classic,
    pn.theme_dark,
    pn.theme_gray,
    pn.theme_grey,
    pn.theme_light,
    pn.theme_linedraw,
    pn.theme_matplotlib,
    pn.theme_minimal,
    pn.theme_seaborn,
    pn.theme_tufte,
    pn.theme_void,
    pn.theme_xkcd,
}

# Lookup from a theme class name (e.g. ``"theme_bw"``) to the theme class.
_theme_name_to_theme = {theme.__name__: theme for theme in _valid_themes}


@require(
    lambda name: name in _time_unit_abbreviation_to_name.keys(),
    f"Unsupported time unit. Must be one of: {_time_unit_abbreviation_to_name.keys()}",
)
def get_time_unit_abbreviation(name: TimeUnit) -> TimeUnit:
    """
    Normalize a time unit given by name or abbreviation to its full name.

    .. note::
        Despite what the function name suggests, the value returned is the
        *full* unit name (``"ms"`` -> ``"milliseconds"``), not the
        abbreviation. Full names are returned unchanged. The rest of this
        module relies on that behavior because the conversion table is keyed
        by full unit names.

    Parameters
    ----------
    name : str
        The name or abbreviation of a supported time unit.

    Returns
    -------
    str
        The full name of the time unit specified by ``name``.

    Raises
    ------
    icontract.errors.ViolationError
        If the ``name`` is not a supported :data:`TimeUnit`.
    """
    # The fallback to ``name`` only matters when the contract above is disabled.
    return _time_unit_abbreviation_to_name.get(name, name)
@require(lambda from_unit: from_unit in get_args(TimeUnit), "Invalid time unit")
@require(lambda to_unit: to_unit in get_args(TimeUnit), "Invalid time unit")
def change_time_unit(
    value: int | float, from_unit: TimeUnit, to_unit: TimeUnit
) -> float:
    """
    Change the unit of time of a given ``value`` currently in the
    ``from_unit`` unit to a value in the ``to_unit`` unit.

    Both units may be given either by full name (``"milliseconds"``) or by
    abbreviation (``"ms"``).

    Parameters
    ----------
    value : int or float
        The current time value.
    from_unit : TimeUnit
        The unit of the current value.
    to_unit : TimeUnit
        The unit to change the value into.

    Returns
    -------
    float
        Return the equivalent value in the new time unit ``to_unit``

    Raises
    ------
    icontract.errors.ViolationError
        If either unit is not a supported :data:`TimeUnit`.
    """
    # The contracts accept abbreviations, but the conversion table is keyed by
    # full unit names only, so normalize before looking up the factors.
    source_unit = _time_unit_abbreviation_to_name.get(from_unit, from_unit)
    target_unit = _time_unit_abbreviation_to_name.get(to_unit, to_unit)

    seconds = value * _time_conversion_factors[source_unit]

    return seconds / _time_conversion_factors[target_unit]
def validate_call_signature(
    fn1: Callable[..., Any], fn2: Callable[..., Any], strict: bool = False
) -> bool:
    """
    Report whether two callables accept the same parameters.

    Parameters are compared pairwise in declaration order.

    Parameters
    ----------
    fn1 : Callable[..., Any]
        The first function to compare.
    fn2 : Callable[..., Any]
        The second function to compare.
    strict : bool, default = False
        If ``True`` each pair of :class:`inspect.Parameter` objects must be
        equal, which includes whether defaults are present and their values.
        Otherwise, a pair matches when the parameter names and annotations
        are equal.

    Returns
    -------
    bool
        ``True`` if both callables declare the same number of parameters and
        every pair of parameters matches under the selected mode.
    """
    first = list(signature(fn1).parameters.values())
    second = list(signature(fn2).parameters.values())

    if len(first) != len(second):
        return False

    for left, right in zip(first, second):
        if strict:
            if not left == right:
                return False
        elif not (left.name == right.name and left.annotation == right.annotation):
            return False

    return True
# region BenchmarkRunner Input Validation
#
# These predicates are used as icontract conditions on ``BenchmarkRunner.__init__``.
# Their parameter names must match the ``__init__`` argument names because the
# arguments are bound to the conditions by name.


def _as_function_list(functions: Iterable[Callable]) -> list[Callable]:
    """Return the callables in ``functions`` as a list (the values, for a dict)."""
    return list(functions.values()) if isinstance(functions, dict) else list(functions)


def _parameter_names(fn: Callable) -> list[str]:
    """Return the names of the parameters declared by ``fn``."""
    return [p.name for p in signature(fn).parameters.values()]


def _are_setup_fn_and_functions_compatible(
    benchmark_space: dict[str, list[T]],
    setup_fn: Optional[Callable],
    functions: Iterable[Callable],
) -> bool:
    """
    Check that the output of ``setup_fn`` can be used as ``**kwargs`` for the
    first benchmark function. Always ``True`` when there is no ``setup_fn``.
    """
    if setup_fn is None:
        return True

    candidates = _as_function_list(functions)

    # Mirrors the guard in ``_are_benchmark_space_and_functions_compatible``;
    # without it an empty collection would raise IndexError below.
    if not candidates:
        return False

    # Probe the setup function once using the first value of every parameter.
    # NOTE(review): an empty value list raises IndexError here, because the
    # benchmark space check only requires at least one non-empty list.
    kwargs = {k: v[0] for k, v in benchmark_space.items()}
    setup_fn_output = setup_fn(**kwargs)

    if not isinstance(setup_fn_output, dict):
        return False

    parameter_names = _parameter_names(candidates[0])

    if len(parameter_names) != len(setup_fn_output):
        return False

    return all(k in parameter_names for k in setup_fn_output)


def _are_benchmark_space_and_functions_compatible(
    setup_fn: Optional[Callable],
    benchmark_space: dict[str, list[T]],
    functions: Iterable[Callable],
) -> bool:
    """
    Check that the ``benchmark_space`` keys are exactly the parameters of the
    first benchmark function. Always ``True`` when a ``setup_fn`` is given.
    """
    if setup_fn is not None:
        return True

    candidates = _as_function_list(functions)

    if not candidates:
        # We shouldn't get here if the decorators are ordered properly
        return False

    parameter_names = _parameter_names(candidates[0])

    if len(parameter_names) != len(benchmark_space):
        return False

    return all(k in parameter_names for k in benchmark_space)


def _is_space_distinct_from_run_label(
    benchmark_space: dict[str, list[T]], run_label: str
) -> bool:
    """Check that no ``benchmark_space`` key equals the ``run_label``."""
    return not any(k == run_label for k in benchmark_space.keys())


def _functions_have_same_call_signatures(functions: Iterable[Callable]) -> bool:
    """Check that every function has the same call signature as the first one."""
    candidates = _as_function_list(functions)

    if len(candidates) < 1:
        return False

    fn1 = candidates[0]

    return all(validate_call_signature(fn1, fn2) for fn2 in candidates[1:])


def _functions_have_proper_type(functions: Iterable[Callable]) -> bool:
    """Check that ``functions`` is a ``dict[str, callable]`` or an iterable of callables."""
    if isinstance(functions, dict):
        return all(isinstance(k, str) and callable(v) for k, v in functions.items())

    return all(callable(f) for f in functions)


def _are_benchmark_space_and_setup_fn_compatible(
    benchmark_space: dict[str, list[T]], setup_fn: Optional[Callable]
) -> bool:
    """Check that the ``benchmark_space`` keys are exactly the ``setup_fn`` parameters."""
    # There's nothing to check
    if setup_fn is None:
        return True

    parameter_names = _parameter_names(setup_fn)

    # They need the same number of arguments
    if len(parameter_names) != len(benchmark_space):
        return False

    return all(k in parameter_names for k in benchmark_space)


def _benchmark_space_has_valid_keys(benchmark_space: dict[str, list[T]]) -> bool:
    """Check that no key collides with a summary statistic column name."""
    return not any(k in ("min", "max", "mean", "std") for k in benchmark_space)


def _benchmark_space_is_dict_of_lists(benchmark_space: dict[str, list[T]]) -> bool:
    """
    Check that ``benchmark_space`` is a non-empty ``dict`` mapping strings to
    lists of ``bool``/``int``/``float``/``str`` values, with at least one
    non-empty list.

    Malformed input (not a dict, or values that are not lists) yields
    ``False`` instead of raising.
    """
    if not isinstance(benchmark_space, dict) or not benchmark_space:
        return False

    # Validate the container shape before iterating the values; iterating a
    # non-list value (e.g. an int) would otherwise raise a TypeError.
    is_dict_of_lists = all(
        isinstance(k, str) and isinstance(v, list) for k, v in benchmark_space.items()
    )

    if not is_dict_of_lists:
        return False

    if not any(bool(v) for v in benchmark_space.values()):
        return False

    return all(
        isinstance(v, (bool, int, float, str))
        for lov in benchmark_space.values()
        for v in lov
    )


def _is_valid_float_format(float_fmt: str) -> bool:
    """Check that ``float_fmt`` is a format spec that can format a float."""
    try:
        format(123.45, float_fmt)
    except (ValueError, TypeError):
        return False
    else:
        return True


# endregion BenchmarkRunner Input Validation
class BenchmarkRunner:
    """
    A class for comparing the execution time of multiple functions
    """

    # Maps the supported aggregation function names to their implementations.
    _aggregate_function_lookup = {
        "min": np.min,
        "max": np.max,
        "mean": np.mean,
        "median": np.median,
        "sum": np.sum,
    }

    # Decorators are evaluated from bottom to top. So more complex contracts,
    # especially ones that depend on previous checks being performed, must be
    # defined first even though it is more intuitive to have the contracts
    # provided in the same order as the __init__ arguments.
    @require(
        _are_setup_fn_and_functions_compatible,
        "'setup_fn' should return a dict suitable to use as **kwargs for the test functions.",
    )
    @require(
        _are_benchmark_space_and_functions_compatible,
        "The 'benchmark_space' keys must match the 'functions' params when 'setup_fn' is None.",
    )
    @require(
        _is_space_distinct_from_run_label,
        "The 'benchmark_space' can't have a key that is equal to the 'run_label'.",
    )
    @require(
        _functions_have_same_call_signatures,
        "All the functions must have the same call signature.",
    )
    @require(
        _functions_have_proper_type,
        "'functions' must be a dict[str, callable] or an iterable of callables.",
    )
    @require(lambda functions: bool(list(functions)), "'functions' must be non-empty.")
    @require(
        lambda functions: not is_generator(functions),
        "'functions' cannot be a generator.",
    )
    @require(lambda functions: is_iterable(functions), "'functions' must be iterable.")
    @require(
        _are_benchmark_space_and_setup_fn_compatible,
        "The 'benchmark_space' keys must match the 'setup_fn' params if 'setup_fn' is given.",
    )
    @require(
        lambda setup_fn: callable(setup_fn) if setup_fn is not None else True,
        "'setup_fn' must be callable.",
    )
    @require(
        _benchmark_space_has_valid_keys,
        "'benchmark_space' cannot contain the keys: [min, max, mean, std]",
    )
    @require(
        _benchmark_space_is_dict_of_lists,
        "'benchmark_space' must be a non-empty dict[str, list[bool, int, float, str]].",
    )
    @require(lambda n_runs: n_runs > 0, "There must be at least one run.")
    @require(lambda n_tests: n_tests > 0, "There must be at least one test")
    @require(lambda n_warm_ups: n_warm_ups >= 0, "There can't be negative warm ups.")
    @require(
        lambda time_unit: time_unit in get_args(TimeUnit),
        f"Invalid 'time_unit' must be one of: {get_args(TimeUnit)}",
    )
    @require(
        lambda test_agg_fn: test_agg_fn in get_args(AggregationFunctionName),
        f"Invalid 'test_agg_fn' function name. It must be one of: {get_args(AggregationFunctionName)}",
    )
    @require(_is_valid_float_format, "Invalid 'float_fmt'")
    @require(
        lambda sort_by: sort_by in get_args(BenchmarkSortValue),
        f"Invalid 'sort_by' value. It must be one of: {get_args(BenchmarkSortValue)}.",
    )
    def __init__(
        self,
        functions: Iterable[Callable] | dict[str, Callable],
        benchmark_space: dict[str, list[T]],
        setup_fn: Optional[Callable] = None,
        use_single_setup: bool = True,
        n_runs: int = 10,
        n_tests: int = 2,
        n_warm_ups: int = 1,
        time_unit: TimeUnit = cast(TimeUnit, "s"),
        test_agg_fn: AggregationFunctionName = "min",
        run_label: str = "run",
        show_progress: bool = False,
        verbose: bool = True,
        float_fmt: str = "0.4e",
        sort_by: BenchmarkSortValue = "min",
        test_significance: bool = True,
    ):
        """
        A class for profiling a set of functions based on one or more criteria.

        The high level view of the benchmarking process is as follows. For
        every combination of parameters in the ``benchmark_space`` a
        sub-benchmark will be run. There are 2 nested execution loops for
        each sub-benchmark. The innermost loop runs each function on the
        current data ``n_tests`` number of times and aggregates the results
        using the ``test_agg_fn``. The same data is always used for this loop
        no matter what. The outer loop will run this process ``n_runs`` times.
        If ``use_single_setup`` is ``True`` then the setup function will only
        be called once and will be used for all the runs. If it is ``False``
        the setup function will be called for every run. The execution times
        for all runs will be stored in a Pandas DataFrame that can be
        retrieved after calling the benchmark.

        .. note::
            ``min``, ``max``, ``mean``, ``std`` cannot be used as keys in the
            ``benchmark_space``.

        .. note::
            ``functions`` must be an iterable, but cannot be a generator.

        .. note::
            The first parameter in the ``benchmark_space`` is always used as
            the x-axis for :meth:`BenchmarkResult.plot`.

        Parameters
        ----------
        functions : list[BenchmarkFunction]
            A list of functions to benchmark or a dictionary that maps a label
            to a benchmark function. When an iterable is given each function
            is labelled with its ``__name__``; functions that would share a
            label (e.g. several lambdas) get a numeric suffix.
        benchmark_space : dict[str, list[T]]
            The space of values to benchmark over. A benchmark will be executed
            for each combination of values obtained from the dictionary. The
            combinations are formed by taking the Cartesian product taking
            one value from each list in the dictionary. The names of the keys
            of this dictionary must either be the names of keyword arguments
            of the ``setup_fn``, or keyword arguments of the benchmark
            functions if no ``setup_fn`` is provided. Only ``bool``, ``int``,
            ``float``, and ``str`` values are supported.
        setup_fn : SetupFunction
            A function that initializes data to be passed to the benchmark
            ``functions``. If ``None``, the values from ``benchmark_space``
            will be passed directly to each function in ``functions``.
        use_single_setup : bool, default = True
            For functions that are guaranteed to be deterministic no matter
            what the input is, this should be ``True``. However, if the
            function is non-deterministic or the performance might depend
            on how the data is initialized, this should be ``False``.
        n_runs : int
            The number of execution tests to run.
        n_tests : int
            The number times to run each function in a single test.
        n_warm_ups : int
            The number of tests to run before recording the timing data.
        time_unit : TimeUnit
            The unit of time, given by name or abbreviation, that the
            measured execution times are converted to.
        test_agg_fn : {'min', 'max', 'mean', 'median', 'sum'}
            The function to use for aggregating individual test results
            within a run.
        run_label : str
            The column label in the resulting Pandas ``DataFrame`` that
            indicates the run number for the given execution times.
        show_progress : bool = False
            Show progress bars while running the benchmark.
        verbose : bool, default = True
            Print the full results and summary statistics to ``stdout`` when
            complete.
        float_fmt : str
            The format used to print floating point values to a string.
        sort_by : str {min, mean, function}
            When ``verbose=True`` this will determine how the results are
            sorted (either by the min run time, mean run time or by the
            function name).
        test_significance: bool, default = True
            If ``True``, test if the difference in run times are different
            between all pairs of models.

        Notes
        -----
        **Deterministic Function and Deterministic Data**

        If your algorithm is deterministic and is not influenced at all by
        the content of the data, only its size, then I would suggest the
        following parameters:

        * ``use_single_setup = True``: Use the same data for all the runs on
          the current setup parameters.
        * ``n_runs > 1``: Run it at least a few times per parameter set to
          make sure there weren't any anomalies biasing the results.
        * ``n_tests = 1``: You should not need to run multiple tests here.

        **Deterministic Function and Non-Deterministic Data**

        If your function is deterministic (the sequence of execution is
        always the same), but could be influenced by the content of the data
        I would suggest the following parameters:

        * ``setup_fn != None``: The setup function should return different
          data each run (of the same size)
        * ``use_single_setup = False``: Run the setup function to generate
          new data on each run.
        * ``n_tests > 1``: Run the function on the same data a few times in
          case there was an anomaly, which could bias the result.
        * ``n_runs > 1``: Run the function on multiple different data sets
          to estimate how much variability is expected due to the makeup of
          the data.
        * ``test_agg_fn = 'min'``: Since the function should execute the
          same way on the same data, the `min` should be the most
          informative.

        **Non-Deterministic Function**

        If the function itself is non-deterministic you probably want
        something similar to the deterministic case with non-deterministic
        data. In this case however, it is probably pointless to set
        ``n_tests > 1`` and you should just increase ``n_runs`` to get better
        overall estimates.

        Examples
        --------
        .. code-block:: python

            >>> import time
            >>> sort_setup_fn = (
            ...     lambda array_size, dtype, unique_values:
            ...         np.random.randint(unique_values, size=array_size).astype(dtype)
            ... )
            >>> b = BenchmarkRunner(functions={"fn1": lambda a: time.sleep(0.01),
            ...                                "fn2": lambda a: time.sleep(0.02)},
            ...                     benchmark_space={"array_size": [100, 10000],
            ...                                      "dtype": ["U", "int"],
            ...                                      "unique_values": [10, 100, 1000]},
            ...                     setup_fn=sort_setup_fn,
            ...                     time_unit="ms",
            ...                     float_fmt="0.3f")
            >>> b()
            function  array_size  unique_values    min   mean    std
            --------------------------------------------------------
                 fn1         100             10  1.040  1.053  0.008
                 fn2         100             10  5.057  5.058  0.001
            --------------------------------------------------------
                 fn1         100            100  1.053  1.056  0.002
                 fn2         100            100  5.057  5.058  0.001
            --------------------------------------------------------
                 fn1         100           1000  1.056  1.057  0.000
                 fn2         100           1000  5.058  5.058  0.000
            --------------------------------------------------------
                 fn1       10000             10  1.056  1.057  0.000
                 fn2       10000             10  5.058  5.059  0.000
            --------------------------------------------------------
                 fn1       10000            100  1.056  1.057  0.000
                 fn2       10000            100  5.058  5.064  0.009
            --------------------------------------------------------
                 fn1       10000           1000  1.056  1.057  0.001
                 fn2       10000           1000  5.063  5.066  0.002
        """
        self._check_for_configuration_problems(use_single_setup, setup_fn, n_runs)

        self.functions = self._normalize_functions(functions)
        self.benchmark_space = benchmark_space
        # Without a setup function the parameter values go straight to the functions.
        self.setup_fn = setup_fn or self._passthrough_setup_args
        self.use_single_setup = use_single_setup
        self.n_runs = n_runs
        self.n_tests = n_tests
        self.n_warm_ups = n_warm_ups
        self.test_agg_fn = self._aggregate_function_lookup[test_agg_fn]
        self.run_label = run_label
        self.show_progress = show_progress
        self.verbose = verbose
        # Stored as the full unit name, which is what the conversion table is keyed by.
        self.time_unit: TimeUnit = cast(TimeUnit, get_time_unit_abbreviation(time_unit))
        self.float_fmt = float_fmt
        self.sort_by = sort_by
        self.test_significance = test_significance

    def __call__(self) -> BenchmarkResult:
        """Runs the benchmark.

        Returns
        -------
        BenchmarkResult
        """
        return self.run()

    def run(self) -> BenchmarkResult:
        """Runs the benchmark.

        Returns
        -------
        BenchmarkResult
        """
        # Create the DataFrame to store the results
        results = self._initialize_data_frame()

        # Create a generator to iterate through all possible combinations of
        # the benchmark space.
        combinations = product(*self.benchmark_space.values())

        # Set up the parameter space loop with optional progress indicators
        parameter_progress = self._progress(
            combinations,
            total=np.prod([len(x) for x in self.benchmark_space.values()]),
            position=0,
            unit="parameters",
        )

        # Loop through all combinations
        for parameter_values in parameter_progress:
            # Create the setup_fn kwargs
            setup_kwargs = dict(zip(self.benchmark_space.keys(), parameter_values))

            # Get the function arguments here if we only want to generate
            # them once for all runs of this parameter set (i.e., pure
            # deterministic functions).
            function_args = (
                self.setup_fn(**setup_kwargs) if self.use_single_setup else None
            )

            # Set up the run loop with optional progress indicators
            run_progress = self._progress(
                range(1, self.n_runs + 1), position=1, leave=False, unit="runs"
            )

            for run_num in run_progress:
                self._do_run(results, run_num, setup_kwargs, function_args)

        significance_results = (
            self._run_ttest(results) if self.test_significance else None
        )

        result = BenchmarkResult(
            results,
            significance_results,
            self.benchmark_space,
            self.float_fmt,
            self.sort_by,
            self.run_label,
            self.time_unit,
        )

        if self.verbose:
            print(result)

        return result

    @staticmethod
    def _check_for_configuration_problems(
        use_single_setup: bool, setup_fn: Optional[Callable], n_runs: int
    ):
        """Log a warning for argument combinations that are valid but questionable."""
        if not use_single_setup and not setup_fn:
            log.warning(
                "Setting 'use_single_setup' to 'False' implies the performance of the functions "
                "may be non-deterministic. However, no 'setup_fn' was defined so be sure that "
                "the only non-determinism is coming from the functions themselves."
            )

        if not use_single_setup and n_runs == 1:
            log.warning(
                "Setting 'use_single_setup' to 'False' implies the performance of the functions "
                "may be non-deterministic. However, 'n_runs' is set to 1, which will not "
                "capture any variability between runs."
            )

    @staticmethod
    def _normalize_functions(
        functions: list[Callable] | dict[str, Callable]
    ) -> dict[str, Callable]:
        """
        Return ``functions`` as a mapping from label to callable.

        A dict is returned unchanged. Otherwise each callable is labelled with
        its ``__name__`` (or its type name when it has none). A numeric suffix
        is appended when the label is already taken, so that callables sharing
        a name, such as lambdas, are not silently dropped.
        """
        if isinstance(functions, dict):
            return functions

        labelled: dict[str, Callable] = {}

        for function in functions:
            base_label = getattr(function, "__name__", type(function).__name__)
            label = base_label
            suffix = 2

            while label in labelled:
                label = f"{base_label}_{suffix}"
                suffix += 1

            labelled[label] = function

        return labelled

    @staticmethod
    def _passthrough_setup_args(**kwargs: T) -> dict[str, T]:
        """Default setup function: hand the benchmark parameters to the functions as-is."""
        return kwargs

    @staticmethod
    def _identity(x: Any) -> Any:
        return x

    def _set_default_run_agg_fn(self) -> AggregationFunction:
        """Return ``min`` when a single setup is used, otherwise ``mean``."""
        if self.use_single_setup:
            return self._aggregate_function_lookup["min"]

        return self._aggregate_function_lookup["mean"]

    def _progress(self, values: Iterable[I], **tqdm_kwargs):
        """Wrap ``values`` in a progress bar that is disabled unless ``show_progress`` is set."""
        return tqdm(values, disable=not self.show_progress, **tqdm_kwargs)

    def _initialize_data_frame(self) -> pd.DataFrame:
        """
        Create the empty results frame: one row per (run, parameter combination)
        and one float column per function.
        """
        index_map = {
            self.run_label: list(range(1, self.n_runs + 1))
        } | self.benchmark_space
        multi_index = pd.MultiIndex.from_product(
            list(index_map.values()), names=index_map.keys()
        )

        return pd.DataFrame(
            index=multi_index, columns=list(self.functions.keys()), dtype=np.float64
        )

    def _do_run(
        self,
        results: pd.DataFrame,
        run_num: int,
        setup_args: dict[str, T],
        function_args: Optional[dict[str, T]],
    ):
        """Time every function once for run ``run_num`` and store the times in ``results``."""
        # If we didn't set the function arguments already (i.e., use_single_setup=False)
        # we need to set them here. Compare against None explicitly: an empty
        # (falsy) argument dict is a valid cached value and must not trigger
        # another call to the setup function.
        if function_args is None:
            function_args = self.setup_fn(**setup_args)

        # The row is the same for every function in this run.
        row_index = (run_num,) + tuple(setup_args.values())

        for function_name, function in self.functions.items():
            results.loc[row_index, function_name] = self._do_test(
                function, function_args
            )

    def _do_test(self, function: Callable, function_args: dict[str, Any]) -> float:
        """Time ``function`` for the warm-ups plus ``n_tests`` and aggregate the recorded tests."""
        test_times = [
            self._time_function(function, function_args)
            for _ in range(self.n_warm_ups + self.n_tests)
        ]

        # The leading warm-up timings are discarded.
        return self.test_agg_fn(test_times[self.n_warm_ups :])

    def _time_function(self, function: Callable, function_args: dict[str, Any]) -> float:
        """Return the time taken by one call of ``function``, expressed in ``self.time_unit``."""
        start = time.perf_counter_ns()
        function(**function_args)

        return change_time_unit(
            time.perf_counter_ns() - start, "nanoseconds", self.time_unit
        )

    def _run_ttest(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Run a t-test between every pair of functions for each parameter
        combination and return the p-values as a DataFrame.
        """
        benchmark_space = list(self.benchmark_space)
        functions = list(self.functions.keys())

        def _pairwise_ttest(group):
            tt_df = pd.DataFrame(
                {
                    f1: {f2: ttest_ind(group[f1], group[f2]).pvalue for f2 in functions}
                    for f1 in functions
                }
            )
            tt_df.index.name = "function"

            return tt_df

        return df.groupby(benchmark_space).apply(_pairwise_ttest).reset_index()
class BenchmarkResult:
    """A class for managing the results output by a :class:`BenchmarkRunner`."""

    def __init__(
        self,
        results: pd.DataFrame,
        significance_results: Optional[pd.DataFrame],
        benchmark_space: dict[str, list[T]],
        float_fmt: str = "0.4e",
        sort_by: BenchmarkSortValue = "min",
        run_label: str = "run",
        time_unit: TimeUnit = "s",
    ):
        """A class for managing the results output by a :class:`BenchmarkRunner`.

        .. note::
            This class is not intended to be instantiated directly.

        Parameters
        ----------
        results : DataFrame
            The results DataFrame obtained by a :class:`BenchmarkRunner`.
        significance_results : DataFrame, optional
            Pairwise t-test results, or ``None`` if no tests were run.
        benchmark_space : dict[string, list[T]]
            The parameters used to benchmark the functions.
        float_fmt : str
            A valid format string to use for floating point numbers.
        sort_by : str {min, mean, function}
            The summary column to sort the results by when represented
            as a string.
        run_label : str
            The label used to indicate the run number.
        time_unit : TimeUnit
            The original time unit used to benchmark the results, given by
            name or abbreviation.
        """
        self._results = results
        self._significance_results = significance_results
        self._benchmark_space = benchmark_space
        self._float_fmt = float_fmt
        self._sort_by = sort_by
        self._run_label = run_label
        # Store the full unit name: the unit lookup tables are keyed by full
        # names, so an abbreviation such as the default "s" would otherwise
        # raise a KeyError when converting or labelling.
        self._time_unit = _time_unit_abbreviation_to_name.get(time_unit, time_unit)
        self._function_names = set(results.columns)

    def __repr__(self) -> str:
        """The full pandas :class:`pandas.DataFrame` containing all the runs as a string.

        Returns
        -------
        str
            The full benchmark results as a string.
        """
        return self._results.to_string()

    def __str__(self) -> str:
        """Returns a table of the summary statistics of the benchmark results as a string.

        Returns
        -------
        str
            The summary statistics of the benchmark results as a string.
        """
        summary = self.summary()
        groups = summary.groupby(list(self.benchmark_space))
        widths = self._get_max_column_widths(summary)
        header = self._stringify_header(summary.columns, widths)
        header_size = len(header)

        lines = ["=" * header_size, header]

        for i, (_, group) in enumerate(groups):
            # The header is closed with '=', the groups are separated by '-'.
            separator = "=" if i == 0 else "-"
            lines.append(separator * header_size)
            lines.extend(self._stringify_group(group, widths))

        return "\n".join(lines)

    @property
    def benchmark_space(self) -> dict[str, list[T]]:
        """Return the parameters used to benchmark the functions.

        Returns
        -------
        dict[str, list[T]]
            A shallow copy of the parameters used to produce these results.
        """
        return self._benchmark_space.copy()

    @require(
        lambda as_time_unit: (
            as_time_unit in get_args(TimeUnit) if as_time_unit else True
        )
    )
    def results(
        self, wide: bool = True, as_time_unit: Optional[TimeUnit] = None
    ) -> pd.DataFrame:
        """Get a pandas ``DataFrame`` containing the results.

        Parameters
        ----------
        as_time_unit : TimeUnit, optional
            Return the results in this time unit instead of the one used
            during the benchmark.
        wide : bool, default = True
            If ``True`` return the results in the default wide format, which
            is easier to read. Otherwise, return the results in long format,
            which can be easier to use for plotting.

        Returns
        -------
        DataFrame
            The DataFrame containing the results, either in wide or
            long format.
        """
        results = self._results.copy(deep=True)

        if as_time_unit is not None:
            results = results.map(
                lambda t: change_time_unit(t, self._time_unit, as_time_unit)
            )  # noqa

        if wide:
            return results

        return pd.melt(
            results.reset_index(),
            id_vars=[self._run_label] + list(self.benchmark_space.keys()),
            var_name="function",
            value_name="time",
        )

    @property
    def significance_results(self) -> Optional[pd.DataFrame]:
        """Return the results of the significance tests as a :class:`pandas.DataFrame`.

        Returns
        -------
        DataFrame or None
            A copy of the results of the significance tests, or ``None`` if
            no significance results are available.
        """
        if self._significance_results is None:
            return None

        return self._significance_results.copy(deep=True)

    @require(
        lambda as_time_unit: (
            as_time_unit in get_args(TimeUnit) if as_time_unit else True
        )
    )
    def summary(
        self, wide: bool = True, as_time_unit: Optional[TimeUnit] = None
    ) -> pd.DataFrame:
        """The summary statistics of the benchmark results.

        Parameters
        ----------
        wide : bool, default = True
            If ``True``, return the results in wide format, otherwise,
            return the results in long format.
        as_time_unit : TimeUnit, optional
            Return the results in this time unit instead of the one used
            during the benchmark.

        Returns
        -------
        pd.DataFrame
            The DataFrame with the summary statistics.
        """
        long_results = self.results(wide=False, as_time_unit=as_time_unit)

        group_by = ["function"] + list(self.benchmark_space)
        summary = long_results.groupby(group_by).agg(
            {"time": ["min", "max", "mean", "std"]}
        )
        # Flatten the ('time', <stat>) column pairs down to the statistic name.
        summary.columns = [c2 or c1 for c1, c2 in summary.columns]
        summary.reset_index(inplace=True)
        summary = self._add_significance_to_summary(summary)

        if wide:
            return summary

        return summary.melt(
            id_vars=["function"] + list(self.benchmark_space),
            var_name="agg",
            value_name="value",
        )

    # noinspection PyTypeChecker
    @require(
        lambda self, functions: (
            all(f in self._function_names for f in functions) if functions else True
        ),
        "At least one of the functions provided is not in the benchmark results.",
    )
    def plot(
        self,
        x_label: str = None,
        use_stat: Literal["min", "mean"] = "min",
        functions: Iterable[str] = None,
        show_points: bool = False,
        show_ribbon: bool = False,
        free_y: bool = False,
        theme_name: PlotTheme = None,
        figure_size: Optional[tuple[int, int]] = None,
        as_time_unit: Optional[TimeUnit] = None,
    ) -> pn.ggplot:
        """Create and return a
        `ggplot <https://plotnine.org/reference/ggplot.html#plotnine.ggplot>`__
        object that visualizes the benchmark results.

        .. note::
            Use ``show`` or ``save`` on the resulting object to render or
            save it.

        Parameters
        ----------
        x_label : str, optional
            Label the `x-axis` with this value, if given. Otherwise, the
            first key in the ``benchmark_space`` will be used.
        use_stat : str {min, mean}
            Which stat to plot.
        functions: Iterable[str], optional
            If defined, limit the plot to these functions.
        show_points : bool, default = False
            Show each individual run as a point on the graph.
        show_ribbon : bool, default = False
            If ``True``, include a ribbon that encompasses the minimum and
            maximum value over all the runs in a group, and a second ribbon
            spanning one standard deviation around the mean.
        free_y : bool, default = False
            If ``True`` and the ``benchmark_space`` includes more than 1
            parameter, the y-axis is not constrained to be the same for each
            resulting plot.
        theme_name : PlotTheme, optional
            The name of a theme to style your plot with, otherwise, it will
            use the default theme.
        figure_size : tuple[int, int], optional,
            Override the size of the resulting figure. If not specified and
            there are more than one ``benchmark_space`` parameters a
            heuristic is used to try to ensure each subgraph will be legible.
        as_time_unit : TimeUnit, optional
            Display the results in this time unit instead of the one used
            during the benchmarking.

        Returns
        -------
        plotnine.ggplot
            The plot object.
        """
        results = self.results(wide=False, as_time_unit=as_time_unit)
        summary = self.summary(as_time_unit=as_time_unit)
        summary["lb"] = summary["mean"] - summary["std"]
        summary["ub"] = summary["mean"] + summary["std"]

        if functions is not None:
            # Materialize once; the selection is used for two separate filters.
            selected = list(functions)
            results = results[results["function"].isin(selected)]
            summary = summary[summary["function"].isin(selected)]

        search_space_names = list(self._benchmark_space.keys())
        x = search_space_names[0]
        theme = _theme_name_to_theme.get(theme_name)

        plot = pn.ggplot(summary)

        if show_points:
            plot += pn.geom_point(
                data=results, mapping=pn.aes(x=x, y="time", color="function")
            )

        plot += pn.geom_line(mapping=pn.aes(x=x, y=use_stat, color="function"))

        if show_ribbon:
            plot += pn.geom_ribbon(
                mapping=pn.aes(
                    x=x,
                    ymin="lb",
                    ymax="ub",
                    fill="function",
                ),
                inherit_aes=False,
                alpha=0.60,
            )
            plot += pn.geom_ribbon(
                mapping=pn.aes(
                    x=x,
                    ymin="min",
                    ymax="max",
                    fill="function",
                ),
                inherit_aes=False,
                alpha=0.20,
            )

        if x_label is not None:
            plot += pn.xlab(x_label)

        # Label the axis with the unit the plotted values are actually in:
        # the requested display unit if one was given, else the benchmark unit.
        display_unit = as_time_unit or self._time_unit
        display_unit_name = _time_unit_abbreviation_to_name.get(
            display_unit, display_unit
        )
        plot += pn.ylab(f"Time [{_time_unit_name_to_abbreviation[display_unit_name]}]")

        scales = "free_y" if free_y else "fixed"

        if len(search_space_names) == 2:
            plot += pn.facet_grid(
                rows=search_space_names[1], scales=scales, labeller="label_both"
            )
        elif len(search_space_names) == 3 and not free_y:
            plot += pn.facet_grid(
                rows=search_space_names[1],
                cols=search_space_names[2],
                labeller="label_both",
                scales=scales,
            )
        elif len(search_space_names) >= 3:
            ncol = len(self.benchmark_space[search_space_names[1]])
            plot += pn.facet_wrap(
                search_space_names[1:],
                ncol=ncol,
                scales=scales,
                labeller="label_both",
                dir="v",
            )

        if theme is not None:
            plot += theme()

        plot += pn.theme(
            figure_size=self._estimate_figure_size(figure_size),
        )

        return plot

    def _estimate_figure_size(
        self, figure_size: Optional[tuple[int, int]]
    ) -> Optional[tuple[int, int]]:
        """
        Return ``figure_size`` if given. Otherwise return ``None`` for fewer
        than 3 benchmark parameters, or a size that allots a fixed area to
        each sub-plot.
        """
        if figure_size is not None:
            return figure_size

        if len(self._benchmark_space) < 3:
            return None

        # The first parameter is the x-axis; the remaining ones create sub-plots.
        search_space_values = list(self._benchmark_space.values())[1:]
        total_plots = np.prod([len(v) for v in search_space_values])

        sub_plot_width, sub_plot_height = 5, 4
        n_cols = len(search_space_values[0])
        n_rows = int(np.ceil(total_plots / n_cols))

        return sub_plot_width * n_cols, sub_plot_height * n_rows

    def _add_significance_to_summary(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add a ``p_value`` column to the summary ``df``.

        Within each parameter combination the functions are ordered by mean
        time, and each row gets the p-value of the test between that function
        and the next slower one. The slowest function gets ``NaN``. ``df`` is
        returned unchanged when there are no significance results.
        """
        if self._significance_results is None:
            return df

        benchmark_space = list(self.benchmark_space)

        def _make_condition(g: pd.DataFrame, col: str) -> str:
            value = g[col].iloc[0]

            return (
                f"{col} == '{value}'" if isinstance(value, str) else f"{col} == {value}"
            )

        def _function(group: pd.DataFrame):
            group = group.sort_values("mean")
            base_conditions = [_make_condition(group, col) for col in benchmark_space]
            function_names = group["function"].tolist()
            p_values = []

            # Compare each function with the next slower one.
            for fn1, fn2 in zip(function_names, function_names[1:]):
                group_conditions = base_conditions + [f"function == '{fn1}'"]
                query_expr = " and ".join(group_conditions)
                query_row = self._significance_results.query(query_expr)
                p_values.append(query_row[fn2].iloc[0])

            # The slowest function has nothing left to be compared with.
            p_values.append(np.nan)
            group["p_value"] = p_values

            return group

        groups = df.groupby(benchmark_space)
        result = groups.apply(_function, include_groups=True).reset_index(drop=True)
        columns = (
            ["function", "p_value"] + benchmark_space + ["max", "min", "mean", "std"]
        )

        return result[columns]

    def _get_max_column_widths(self, df: pd.DataFrame) -> dict[str, int]:
        """Return, per column, the width of the widest header or formatted value."""
        header_widths = {h: len(h) for h in df.columns}
        data_widths = df.map(lambda x: len(self._stringify_value(x))).max().to_dict()

        return {h: max(hw, data_widths[h]) for h, hw in header_widths.items()}

    def _stringify_value(self, value: Any, width: Optional[int] = None) -> str:
        """Format ``value`` as a string, right-aligned to ``width`` when given."""
        if isinstance(value, (float, np.floating)):
            # Missing values are shown as a dash.
            raw = "-" if np.isnan(value) else "{:{}}".format(value, self._float_fmt)

            return self._stringify_value(raw, width) if width else raw

        return "{:>{}}".format(str(value), width) if width else str(value)

    def _stringify_header(self, columns: Iterable[str], widths: dict[str, int]) -> str:
        """Return the column names, right-aligned to their widths, joined by spaces."""
        return " ".join(["{:>{}}".format(h, f"{widths[h]}s") for h in columns])

    def _stringify_group(
        self, group: pd.DataFrame, widths: dict[str, int]
    ) -> list[str]:
        """Return one formatted line per row of ``group``, sorted by ``sort_by``."""
        sorted_group = group.sort_values(by=self._sort_by)

        return [self._stringify_row(row, widths) for _, row in sorted_group.iterrows()]

    def _stringify_row(self, row: pd.Series, widths: dict[str, int]) -> str:
        """Return the values of ``row``, right-aligned to their widths, joined by spaces."""
        return " ".join(
            self._stringify_value(v, widths[str(i)]) for i, v in row.items()
        )