# Source code for ablator.analysis.main

import logging
from pathlib import Path

import pandas as pd
from joblib import Memory

from ablator.analysis.plot.utils import parse_name_remap
from ablator.main.configs import Optim

logger = logging.getLogger(__name__)


class Analysis:
    """
    A class for analyzing experimental results.

    Attributes
    ----------
    optim_metrics : dict[str, Optim]
        A dictionary mapping metric names to optimization directions.
    save_dir : Path | None
        The directory to save analysis results to, or ``None`` when no
        ``save_dir`` was provided.
    cache : Memory | None
        A joblib memory cache stored under ``<save_dir>/.cache``. It is ``None``
        when caching is disabled or when no ``save_dir`` was provided.
    categorical_attributes : list[str]
        The list of all the categorical hyperparameter names
    numerical_attributes : list[str]
        The list of all the numerical hyperparameter names
    experiment_attributes : list[str]
        The list of all the hyperparameter names (categorical followed by numerical).
    results : pd.DataFrame
        The input dataframe restricted to the hyperparameter columns, the metric
        columns and the ``path`` and ``index`` columns.

    """

    def __init__(
        self,
        results: pd.DataFrame,
        categorical_attributes: list[str],
        numerical_attributes: list[str],
        optim_metrics: dict[str, Optim],
        save_dir: str | Path | None = None,
        cache: bool = False,
    ) -> None:
        """
        Initialize the Analysis class.

        Parameters
        ----------
        results : pd.DataFrame
            The result dataframe. It must contain every hyperparameter column,
            every metric column, and the ``path`` and ``index`` columns.
        categorical_attributes : list[str]
            The list of all the categorical hyperparameter names
        numerical_attributes : list[str]
            The list of all the numerical hyperparameter names
        optim_metrics : dict[str, Optim]
            A dictionary mapping metric names to optimization directions.
        save_dir : str | Path | None
            The directory to save analysis results to. Its parent directory must
            already exist; the directory itself is created when missing.
        cache : bool
            Whether to cache results. Only has an effect when ``save_dir`` is given.
            When ``False``, any cache already present under ``save_dir`` is cleared.

        Raises
        ------
        FileNotFoundError
            If the parent directory of ``save_dir`` does not exist.
        """
        self.optim_metrics = optim_metrics
        self.save_dir: Path | None = None
        self.cache: Memory | None = None
        if save_dir is not None:
            self.save_dir = Path(save_dir)
            if not self.save_dir.parent.exists():
                raise FileNotFoundError(
                    f"Save directory does not exist. `{self.save_dir.parent}`"
                )
            self.save_dir.mkdir(exist_ok=True)
            memory = Memory(self.save_dir.joinpath(".cache"), verbose=0)
            if cache:
                self.cache = memory
            else:
                # Caching is disabled: drop whatever a previous run stored so
                # stale entries are not picked up if caching is enabled later.
                memory.clear()
        self.categorical_attributes: list[str] = categorical_attributes
        self.numerical_attributes: list[str] = numerical_attributes
        self.experiment_attributes: list[str] = (
            self.categorical_attributes + self.numerical_attributes
        )

        # Keep only the columns the analysis works with.
        selected_columns = (
            self.experiment_attributes + self.metric_names + ["path", "index"]
        )
        self.results: pd.DataFrame = results[selected_columns]

    @property
    def metric_names(self) -> list[str]:
        """
        The names of the metrics in ``optim_metrics``, in insertion order.

        Returns
        -------
        list[str]
            A new list holding the keys of ``optim_metrics``.
        """
        return list(self.optim_metrics)

    @classmethod
    def _get_best_results_by_metric(
        cls,
        raw_results: pd.DataFrame,
        metric_map: dict[str, Optim],
    ) -> pd.DataFrame:
        """
        Select the best row of every ``path`` group for each metric.

        For every metric in ``metric_map`` the rows of ``raw_results`` are grouped by
        the ``path`` column and a single row per group is kept: the one with the
        lowest metric value when the direction is ``Optim.min``, otherwise the one
        with the highest value. NaN values are ordered so that they are only picked
        when a group has no other value for that metric.

        Parameters
        ----------
        raw_results : pd.DataFrame
            The dataframe to select from. It must contain a ``path`` column and one
            column per metric in ``metric_map``.
        metric_map : dict[str, Optim]
            A dictionary mapping metric names to optimization directions.

        Returns
        -------
        pd.DataFrame
            The selected rows for all metrics concatenated together, with a fresh
            index and an extra ``best`` column holding the name of the metric the
            row was selected for.
        """

        def _best_perf(group: pd.DataFrame, name, obj_fn):
            # Sorting is always ascending; only the end we read from (and where
            # NaNs are parked) depends on the optimization direction.
            if Optim(obj_fn) == Optim.min:
                return group.sort_values(name, na_position="last").iloc[0]
            return group.sort_values(name, na_position="first").iloc[-1]

        per_metric_results = []
        for name, obj_fn in metric_map.items():
            # Bind the loop variables as defaults to avoid late-binding closures.
            res = (
                raw_results.groupby("path")
                .apply(
                    lambda group, name=name, obj_fn=obj_fn: _best_perf(
                        group, name, obj_fn
                    )
                )
                .reset_index(drop=True)
            )
            res["best"] = name
            per_metric_results.append(res)
        report_results = pd.concat(per_metric_results).reset_index(drop=True)

        return report_results

    @classmethod
    def _remap_results(
        cls,
        attributes: pd.DataFrame,
        metrics: pd.DataFrame,
        metric_map: dict[str, Optim],
        metric_name_remap: dict[str, str] | None = None,
        attribute_name_remap: dict[str, str] | None = None,
    ) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, Optim]]:
        """
        Remaps attribute and metric names in ``attributes`` and ``metrics`` DataFrames
        based on ``attribute_name_remap`` and ``metric_name_remap``, and updates ``metric_map``
        accordingly.

        Only the columns that appear as keys in the parsed remaps are kept, and
        entries of ``metric_map`` whose metric is not a key of the parsed metric
        remap are dropped. The input DataFrames are not renamed in place; new
        column-selected frames are returned.

        Parameters
        ----------
        attributes : pandas.DataFrame
            The DataFrame containing attribute values for each experiment.
        metrics : pandas.DataFrame
            The DataFrame containing metric values for each experiment.
        metric_map : dict of str to Optim
            A dictionary mapping metric names to optimization objectives (minimization or maximization).
        metric_name_remap : dict of str to str or None, optional
            A dictionary mapping input metric names to output metric names.
            If None, the output metric names will be the same as the input metric names.
        attribute_name_remap : dict of str to str or None, optional
            A dictionary mapping input attribute names to output attribute names.
            If None, the output attribute names will be the same as the input attribute names.

        Returns
        -------
        pandas.DataFrame, pandas.DataFrame, dict of str to Optim
            The remapped ``attributes`` DataFrame, the remapped ``metrics`` DataFrame,
            and the updated ``metric_map`` dictionary.

        Examples
        --------
        >>> import pandas as pd
        >>> from enum import Enum
        >>> class Optim(Enum):
        ...     min = "min"
        ...     max = "max"
        ...
        >>> attributes = pd.DataFrame({"color": ["red", "blue"], "size": [10, 20]})
        >>> metrics = pd.DataFrame({"loss": [0.5, 0.4], "accuracy": [0.8, 0.9]})
        >>> metric_map = {"loss": Optim.min, "accuracy": Optim.max}
        >>> metric_name_remap = {"loss": "error", "accuracy": "acc"}
        >>> attribute_name_remap = {"color": "c", "size": "s"}
        >>> remapped_attrs, remapped_metrics, updated_map = Analysis._remap_results(
        ...     attributes, metrics, metric_map,
        ...     metric_name_remap=metric_name_remap,
        ...     attribute_name_remap=attribute_name_remap
        ... )
        >>> assert list(remapped_attrs.columns) == ["c", "s"]
        >>> assert list(remapped_metrics.columns) == ["error", "acc"]
        >>> assert updated_map == {"error": Optim.min, "acc": Optim.max}
        """
        # NOTE(review): `parse_name_remap` is defined elsewhere; it is used here as
        # returning a dict from existing column names to output names - confirm.
        metric_name_remap = parse_name_remap(metrics.columns, metric_name_remap)
        attribute_name_remap = parse_name_remap(
            attributes.columns, attribute_name_remap
        )
        metric_map = {
            metric_name_remap[metric_name]: direction
            for metric_name, direction in metric_map.items()
            if metric_name in metric_name_remap
        }

        # Column selection yields new frames, so the renames below do not touch
        # the DataFrames owned by the caller.
        attributes = attributes[list(attribute_name_remap.keys())]
        metrics = metrics[list(metric_name_remap.keys())]
        attributes.columns = [attribute_name_remap[c] for c in attributes.columns]
        metrics.columns = [metric_name_remap[c] for c in metrics.columns]
        return attributes, metrics, metric_map