Source code for lightautoml.addons.utilization.utilization

"""Tools to configure time utilization."""

import logging

from copy import deepcopy
from typing import Any
from typing import Iterable
from typing import Optional
from typing import Sequence
from typing import Type
from typing import Union

from ...automl.base import AutoML
from ...automl.blend import BestModelSelector
from ...automl.blend import Blender
from ...automl.presets.base import AutoMLPreset
from ...dataset.base import LAMLDataset
from ...dataset.utils import concatenate
from ...ml_algo.base import MLAlgo
from ...pipelines.ml.base import MLPipeline
from ...tasks import Task
from ...utils.logging import set_stdout_level
from ...utils.logging import verbosity_to_loglevel
from ...utils.timer import PipelineTimer

from joblib import dump


logger = logging.getLogger(__name__)


class MLAlgoForAutoMLWrapper(MLAlgo):
    """Thin ``MLAlgo`` container that holds automl object(s) so a blender can be applied to them."""

    @classmethod
    def from_automls(cls, automl: Union[AutoML, Sequence[AutoML]]):
        """Build a wrapper around the given automl(s).

        Args:
            automl: One AutoML or a sequence of AutoML objects. The value is
                stored as a single entry of ``models`` (a sequence is appended
                as-is, it is not unpacked).

        Returns:
            New wrapper instance.

        """
        wrapper = cls()
        wrapper.models.append(automl)
        return wrapper

    def fit_predict(self, *args, **kwargs) -> LAMLDataset:
        """Not supported by the wrapper; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def predict(self, *args, **kwargs) -> LAMLDataset:
        """Not supported by the wrapper; always raises ``NotImplementedError``."""
        raise NotImplementedError


class MLPipeForAutoMLWrapper(MLPipeline):
    """``MLPipeline`` shell around wrapped automl(s) so a blender can be applied to them."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``MLPipeline`` and publish the algos list."""
        super().__init__(*args, **kwargs)
        # Make the algos stored in ``_ml_algos`` reachable as ``ml_algos``.
        self.ml_algos = self._ml_algos

    @classmethod
    def from_automl(cls, automl: AutoML):
        """Create a pipeline that holds a single wrapped automl.

        Args:
            automl: AutoML object to wrap.

        Returns:
            New pipeline whose only algo wraps ``automl``.

        """
        wrapped_algo = MLAlgoForAutoMLWrapper.from_automls(automl)
        return cls([wrapped_algo])

    @classmethod
    def from_blended(cls, automls: Sequence[AutoML], blender: Blender):
        """Create a pipeline that holds several automls together with their blender.

        Args:
            automls: Sequence of AutoML objects, wrapped together in one algo.
            blender: Blender, stored on the pipeline as ``blender``.

        Returns:
            New pipeline with the ``blender`` attribute set.

        """
        wrapped_algo = MLAlgoForAutoMLWrapper.from_automls(automls)
        pipe = cls([wrapped_algo])
        pipe.blender = blender
        return pipe



[docs]
class TimeUtilization:
    """Class that helps to utilize given time to :class:`~lightautoml.automl.presets.base.AutoMLPreset`.

    Useful to calc benchmarks and compete.
    It takes a list of config files as input and runs them until the time limit is exceeded.
    If time is left - it can perform multistart on the same configs with a new random state.
    In best case - blend different configurations of single preset.
    In worst case - averaging multiple automl's with different states.

    Note:
        Basic usage.

        >>> ensembled_automl = TimeUtilization(TabularAutoML, Task('binary'),
        ...     timeout=3600, configs_list=['cfg0.yml', 'cfg1.yml'])

        Then ``.fit_predict`` and ``.predict`` can be
        called like usual :class:`~lightautoml.automl.base.AutoML` class.


    Args:
        automl_factory: One of presets.
        task: Task to solve.
        timeout: Timeout in seconds.
        memory_limit: Memory limit that is passed to each automl.
        cpu_limit: Cpu limit that is passed to each automl.
        gpu_ids: Gpu_ids that are passed to each automl.
        timing_params: Timing_params that are passed to each automl
            and to the internal timer.
        configs_list: List of str paths to config files. If ``None``,
            a single run configuration with ``config_path=None`` is used.
        inner_blend: Blender instance to blend automl's with same configs
            and different random state. ``BestModelSelector`` if ``None``.
        outer_blend: Blender instance to blend averaged by random_state
            automl's with different configs. ``BestModelSelector`` if ``None``.
        drop_last: Usually last automl will be stopped with timeout.
            Flag that defines if we should drop it from ensemble.
        return_all_predictions: Skip blend and return all model predictions.
        max_runs_per_config: Maximum number of multistart loops.
        random_state_keys: Params of config that used as
            random state with initial values. If ``None`` - search for
            `random_state` key in default config of preset.
            If not found - assume, that seeds are not fixed
            and each run is random by default. For example
            ``{'reader_params': {'random_state': 42}, 'gbm_params': {'default_params': {'seed': 42}}}``
        random_state: initial random seed, that will be
            set in case of search in config.
        **kwargs: Additional params, forwarded to ``automl_factory``
            on every run.

    """

    def __init__(
        self,
        automl_factory: Type[AutoMLPreset],
        task: Task,
        timeout: int = 3600,
        memory_limit: int = 16,
        cpu_limit: int = 4,
        gpu_ids: Optional[str] = None,
        timing_params: Optional[dict] = None,
        configs_list: Optional[Sequence[str]] = None,
        inner_blend: Optional[Blender] = None,
        outer_blend: Optional[Blender] = None,
        drop_last: bool = True,
        return_all_predictions: bool = False,
        max_runs_per_config: int = 5,
        random_state_keys: Optional[dict] = None,
        random_state: int = 42,
        **kwargs,
    ):
        self.automl_factory = automl_factory
        self.task = task
        self.timeout = timeout
        self.memory_limit = memory_limit
        self.cpu_limit = cpu_limit
        self.gpu_ids = gpu_ids

        self.timing_params = {} if timing_params is None else timing_params
        # A single ``None`` entry means "one configuration, no config file".
        self.configs_list = [None] if configs_list is None else configs_list

        self.max_runs_per_config = max_runs_per_config

        self.random_state_keys = random_state_keys
        if random_state_keys is None:
            self.random_state_keys = self._search_for_states(automl_factory, random_state)

        self.inner_blend = inner_blend
        if inner_blend is None:
            self.inner_blend = BestModelSelector()

        self.outer_blend = outer_blend
        if outer_blend is None:
            self.outer_blend = BestModelSelector()

        self.drop_last = drop_last
        self.return_all_predictions = return_all_predictions
        self.kwargs = kwargs

    @staticmethod
    def _config_name(config: Optional[Any]) -> str:
        """Return a short display name of a config entry for log messages.

        Args:
            config: Path to a config file or ``None``.

        Returns:
            Last ``/``-separated component of the path, or ``"default"``
            when no config file is given.

        """
        if config is None:
            # ``configs_list`` defaults to ``[None]``; ``None`` has no ``.split``.
            return "default"
        return str(config).split("/")[-1]

    def _search_for_key(self, config, key, value: int = 42) -> dict:
        """Recursively collect every occurrence of ``key`` in a nested config.

        Args:
            config: Nested dict to search in.
            key: Key name to look for.
            value: Value assigned to every found key.

        Returns:
            Dict with the same nesting as ``config`` that contains
            only the found keys, each mapped to ``value``.

        """
        found = {}

        if key in config:
            found[key] = value

        for name in config:
            sub_config = config[name]
            if isinstance(sub_config, dict):
                nested = self._search_for_key(sub_config, key, value)
                if nested:
                    found[name] = nested

        return found

    def _search_for_states(self, automl_factory: Type[AutoMLPreset], random_state: int = 42) -> dict:
        """Find all ``random_state`` keys in the config returned by ``automl_factory.get_config()``.

        Args:
            automl_factory: Preset class providing ``get_config``.
            random_state: Initial value set for every found key.

        Returns:
            Nested dict of found ``random_state`` keys.

        """
        config = automl_factory.get_config()
        return self._search_for_key(config, "random_state", random_state)

    def _get_upd_states(self, random_state_keys: dict, upd_value: int = 0) -> dict:
        """Build a copy of the random state dict with every leaf shifted by ``upd_value``.

        Args:
            random_state_keys: Nested dict of random states.
            upd_value: Value added to every leaf.

        Returns:
            New nested dict; the input is not modified.

        """
        updated = {}

        for name, state in random_state_keys.items():
            if isinstance(state, dict):
                updated[name] = self._get_upd_states(state, upd_value)
            else:
                updated[name] = state + upd_value

        return updated

    def fit_predict(
        self,
        train_data: Any,
        roles: dict,
        train_features: Optional[Sequence[str]] = None,
        cv_iter: Optional[Iterable] = None,
        valid_data: Optional[Any] = None,
        valid_features: Optional[Sequence[str]] = None,
        verbose: int = 0,
        log_file: str = None,
        path_to_save: Optional[str] = None,
    ) -> LAMLDataset:
        """Fit and get prediction on validation dataset.

        Almost same as :meth:`lightautoml.automl.base.AutoML.fit_predict`.

        Additional features - working with different data formats.
        Supported now:

            - Path to ``.csv``, ``.parquet``, ``.feather`` files.
            - :class:`~numpy.ndarray`, or dict of :class:`~numpy.ndarray`.
              For example, ``{'data': X...}``. In this case,
              roles are optional, but `train_features`
              and `valid_features` required.
            - :class:`pandas.DataFrame`.

        Args:
            train_data: Dataset to train.
            roles: Roles dict.
            train_features: Optional features names, if can't
                be inferred from `train_data`.
            cv_iter: Custom cv-iterator. For example,
                :class:`~lightautoml.validation.np_iterators.TimeSeriesIterator`.
            valid_data: Optional validation dataset.
            valid_features: Optional validation dataset features
                if cannot be inferred from `valid_data`.
            verbose: Controls the verbosity: the higher, the more messages.
                <1  : messages are not displayed;
                >=1 : the computation process for layers is displayed;
                >=2 : the information about folds processing is also displayed;
                >=3 : the hyperparameters optimization process is also displayed;
                >=4 : the training process for every algorithm is displayed;
            log_file: Log filename.
            path_to_save: The path that joblib will use to save the model after fit stage is completed.
                ``.joblib`` is appended if the path does not end with it.

        Returns:
            Dataset with predictions. Call ``.data`` to get predictions array.

        Raises:
            ValueError: If ``configs_list`` is empty.
            AssertionError: If different automl runs produce different class mappings.

        """
        n_configs = len(self.configs_list)
        if n_configs == 0:
            # With no configs the training loop below would never terminate.
            raise ValueError("configs_list must contain at least one config.")

        set_stdout_level(verbosity_to_loglevel(verbose))

        logger.info("Start automl \x1b[1mutilizator\x1b[0m with listed constraints:")
        logger.info(f"- time: {self.timeout:.2f} seconds")
        logger.info(f"- CPU: {self.cpu_limit} cores")
        logger.info(f"- memory: {self.memory_limit} GB\n")
        logger.info("\x1b[1mIf one preset completes earlier, next preset configuration will be started\x1b[0m\n")

        timer = PipelineTimer(self.timeout, **self.timing_params).start()
        # Duration of every finished run; used to estimate if one more run fits.
        history = []

        amls = [[] for _ in range(n_configs)]
        aml_preds = [[] for _ in range(n_configs)]
        class_mapping = None
        n_ms = 0
        n_cfg = 0
        upd_state_val = 0
        max_total_runs = self.max_runs_per_config * n_configs
        keep_running = True
        # train automls one by one while timer is ok
        while keep_running:
            n_ms += 1

            logger.info("=" * 50)

            for n_cfg, config in enumerate(self.configs_list):
                random_states = self._get_upd_states(self.random_state_keys, upd_state_val)
                # Inner automls must return blended predictions; blending of
                # automls themselves is handled below.
                random_states["general_params"] = {"return_all_predictions": False}
                upd_state_val += 1

                logger.info(f"Start {n_cfg} automl preset configuration:")
                logger.info(
                    "\x1b[1m{}\x1b[0m, random state: {}".format(self._config_name(config), random_states)
                )

                # User kwargs that collide with random state sections are merged
                # into them (random state values win) and removed from kwargs so
                # that the factory does not get the same keyword twice.
                cur_kwargs = self.kwargs.copy()
                for k in random_states.keys():
                    if k in self.kwargs:
                        logger.info3("Found {} in kwargs, need to combine".format(k))
                        random_states[k] = {**cur_kwargs[k], **random_states[k]}
                        del cur_kwargs[k]
                        logger.info3("Merged variant for {} = {}".format(k, random_states[k]))

                automl = self.automl_factory(
                    self.task,
                    timer.time_left,
                    memory_limit=self.memory_limit,
                    cpu_limit=self.cpu_limit,
                    gpu_ids=self.gpu_ids,
                    timing_params=self.timing_params,
                    config_path=config,
                    **random_states,
                    **cur_kwargs,
                )

                val_pred = automl.fit_predict(
                    train_data,
                    roles,
                    train_features,
                    cv_iter,
                    valid_data,
                    valid_features,
                    verbose=verbose,
                    log_file=log_file,
                )

                current_class_mapping = automl.reader.class_mapping

                if class_mapping is None:
                    class_mapping = current_class_mapping
                else:
                    assert (
                        class_mapping == current_class_mapping
                    ), "class_mappings are different for different AutoML for some reason."

                logger.info("=" * 50)

                amls[n_cfg].append(MLPipeForAutoMLWrapper.from_automl(automl))
                aml_preds[n_cfg].append(val_pred)

                history.append(timer.time_spent - sum(history))
                # Stop when the remaining time is below the average run time
                # or when the total runs budget is exhausted.
                if timer.time_left < (sum(history) / len(history)) or upd_state_val >= max_total_runs:
                    keep_running = False
                    break

        # usually last model will be not complete due to timeout.
        # Maybe it's better to remove it from inner blend, which is typically just mean of models
        if n_ms > 1 and self.drop_last:
            amls[n_cfg].pop()
            aml_preds[n_cfg].pop()

        # prune empty algos
        amls = [x for x in amls if len(x) > 0]
        aml_preds = [x for x in aml_preds if len(x) > 0]

        # blend - first is inner blend - we blend same config with different states
        inner_pipes = []
        inner_preds = []

        for preds, pipes in zip(aml_preds, amls):
            inner_blend = deepcopy(self.inner_blend)
            val_pred, inner_pipe = inner_blend.fit_predict(preds, pipes, class_mapping=class_mapping)
            # Unwrap the automl objects kept by the blender.
            inner_pipe = [x.ml_algos[0].models[0] for x in inner_pipe]

            inner_preds.append(val_pred)
            inner_pipes.append(MLPipeForAutoMLWrapper.from_blended(inner_pipe, inner_blend))

        # outer blend - blend of blends
        if not self.return_all_predictions:
            val_pred, self.outer_pipes = self.outer_blend.fit_predict(
                inner_preds, inner_pipes, class_mapping=class_mapping
            )
        else:
            val_pred = concatenate(inner_preds)
            self.outer_pipes = inner_pipes

        self.targets_order = sorted(class_mapping, key=class_mapping.get, reverse=False) if class_mapping else None

        # saving automl model with joblib
        if path_to_save is not None:
            dump_name = path_to_save if path_to_save.endswith(".joblib") else f"{path_to_save}.joblib"
            dump(self, dump_name, compress=0)

        return val_pred

    def predict(
        self,
        data: Any,
        features_names: Optional[Sequence[str]] = None,
        return_all_predictions: Optional[bool] = None,
        **kwargs,
    ) -> LAMLDataset:
        """Get dataset with predictions.

        Almost same as :meth:`lightautoml.automl.base.AutoML.predict`
        on new dataset, with additional features.

        Additional features - working with different data formats.
        Supported now:

            - Path to ``.csv``, ``.parquet``, ``.feather`` files.
            - :class:`~numpy.ndarray`, or dict of :class:`~numpy.ndarray`. For example,
              ``{'data': X...}``. In this case roles are optional,
              but `train_features` and `valid_features` required.
            - :class:`pandas.DataFrame`.

        Args:
            data: Dataset to perform inference.
            features_names: Optional features names,
                if cannot be inferred from `data`.
            return_all_predictions: bool - skip blending phase. Ignored
                (treated as ``True``) when the instance was created with
                ``return_all_predictions=True``, because the outer blender
                is not fitted in that case.
            **kwargs: Other params, forwarded to every automl ``predict``.

        Returns:
            Dataset with predictions.

        """
        if return_all_predictions is None or self.return_all_predictions:
            return_all_predictions = self.return_all_predictions

        outer_preds = []

        for amls_pipe in self.outer_pipes:
            # TODO: Maybe refactor?
            # ``models[0]`` is the list of automls stored by ``from_blended``.
            inner_preds = [
                automl.predict(data, features_names, **kwargs) for automl in amls_pipe.ml_algos[0].models[0]
            ]

            outer_preds.append(amls_pipe.blender.predict(inner_preds))

        if not return_all_predictions:
            pred = self.outer_blend.predict(outer_preds)
        else:
            pred = concatenate(outer_preds)

        return pred