Source code for lightautoml.automl.presets.base

"""AutoML presets base class."""

import logging
import os
import shutil

from typing import Any
from typing import Iterable
from typing import Optional
from typing import Sequence

import torch
import yaml

from ...dataset.base import LAMLDataset
from ...tasks import Task
from ...utils.logging import add_filehandler
from ...utils.logging import set_stdout_level
from ...utils.logging import verbosity_to_loglevel
from ...utils.timer import PipelineTimer
from ..base import AutoML


logger = logging.getLogger(__name__)

base_dir = os.path.dirname(__file__)


def upd_params(old: dict, new: dict) -> dict:
    """Update dictonary of parameters.

    Args:
        old: Old parameters.
        new: Changes.

    Returns:
        Updated parameters.

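    Example:
        A minimal sketch (values are illustrative): nested dict keys are merged
        recursively, non-dict values are overwritten, and ``old`` is mutated
        in place.

        >>> upd_params({"a": {"x": 0, "y": 1}, "b": 2}, {"a": {"x": 9}, "b": 3})
        {'a': {'x': 9, 'y': 1}, 'b': 3}
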
    """
    for k in new:
        if type(new[k]) is dict and k in old and type(old[k]) is dict:
            upd_params(old[k], new[k])
        else:
            old[k] = new[k]

    return old


class AutoMLPreset(AutoML):
    """Basic class for automl preset.

    It's almost like AutoML, but with delayed initialization.
    Initialization starts on fit, some params are inferred from data.
    Preset should be defined via ``.create_automl`` method.
    Params should be set via yaml config.
    The most useful case is end-to-end model development.

    Commonly, ``_params`` kwargs (e.g. ``timing_params``) are set via the
    config file (``config_path`` argument). If you need to change just a few
    params, you can pass them as a dict of dicts, like json.
    For the available params and their descriptions, see the default config
    template. To generate a config template,
    call ``SomePreset.get_config('config_path.yml')``.

    Example:
        >>> automl = SomePreset(Task('binary'), timeout=3600)
        >>> automl.fit_predict(data, roles={'target': 'TARGET'})

    Args:
        task: Task to solve.
        timeout: Timeout in seconds.
        memory_limit: Memory limit that is passed to each automl.
        cpu_limit: CPU limit that is passed to each automl.
        gpu_ids: GPU IDs that are passed to each automl.
        debug: Enable debug mode.
        timing_params: Timing param dict.
        config_path: Path to config file.
        **kwargs: Not used.

    """

    _default_config_path = "example_config.yml"

    def __init__(
        self,
        task: Task,
        timeout: int = 3600,
        memory_limit: int = 16,
        cpu_limit: int = 4,
        gpu_ids: Optional[str] = "all",
        debug: bool = False,
        timing_params: Optional[dict] = None,
        config_path: Optional[str] = None,
        **kwargs: Any,
    ):
        self._set_config(config_path)

        for name, param in zip(["timing_params"], [timing_params]):
            if param is None:
                param = {}
            self.__dict__[name] = {**self.__dict__[name], **param}

        self.debug = debug
        self.timer = PipelineTimer(timeout, **getattr(self, "timing_params"))
        self.memory_limit = memory_limit
        if cpu_limit == -1:
            cpu_limit = os.cpu_count()
        self.cpu_limit = cpu_limit
        self.gpu_ids = gpu_ids
        if gpu_ids == "all":
            self.gpu_ids = ",".join(map(str, range(torch.cuda.device_count())))
        self.task = task

    def _set_config(self, path):
        self.config_path = path

        if path is None:
            path = os.path.join(base_dir, self._default_config_path)

        with open(path) as f:
            params = yaml.safe_load(f)

        for k in params:
            self.__dict__[k] = params[k]
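    # A hedged usage sketch (``SomePreset`` is the illustrative subclass from the
    # docstring above; the ``timing_params`` key shown is an assumption, not a
    # documented option): ``_set_config`` first loads every top-level yaml key
    # onto the instance, then the ``timing_params`` kwarg is merged over the
    # config-file values before the PipelineTimer is built.
    #
    # >>> automl = SomePreset(Task("binary"), timeout=1800, timing_params={"mode": 1})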
    @classmethod
    def get_config(cls, path: Optional[str] = None) -> Optional[dict]:
        """Create new config template.

        Args:
            path: Path to config.

        Returns:
            Config if ``path`` is ``None``; otherwise the template is copied
            to ``path`` and ``None`` is returned.

        """
        if path is None:
            path = os.path.join(base_dir, cls._default_config_path)
            with open(path) as f:
                params = yaml.safe_load(f)
            return params
        else:
            shutil.copy(os.path.join(base_dir, cls._default_config_path), path)
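    # Usage sketch (the target filename is illustrative): the two modes follow
    # directly from the branches above.
    # >>> params = AutoMLPreset.get_config()  # parsed default template as a dict
    # >>> AutoMLPreset.get_config("my_config.yml")  # writes template, returns None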
    def create_automl(self, **fit_args):
        """Abstract method - how to build automl.

        Here you should create all automl components,
        like readers, levels, timers, blenders.
        Method ``._initialize`` should be called in the end to create automl.

        Args:
            **fit_args: Params that are passed to ``.fit_predict`` method.

        """
        raise NotImplementedError
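    # A minimal subclass sketch (names are illustrative; the exact arguments of
    # ``_initialize`` are defined in ``AutoML`` and are not spelled out here --
    # check the base class before relying on them):
    #
    # class MyPreset(AutoMLPreset):
    #     def create_automl(self, **fit_args):
    #         reader, levels, blender = ...  # build components within self.timer
    #         self._initialize(...)  # args per AutoML._initialize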
    def fit_predict(
        self,
        train_data: Any,
        roles: dict,
        train_features: Optional[Sequence[str]] = None,
        cv_iter: Optional[Iterable] = None,
        valid_data: Optional[Any] = None,
        valid_features: Optional[Sequence[str]] = None,
        verbose: int = 0,
    ) -> LAMLDataset:
        """Fit on input data and make prediction on validation part.

        Args:
            train_data: Dataset to train.
            roles: Roles dict.
            train_features: Feature names, if they cannot be inferred
                from `train_data`.
            cv_iter: Custom cv-iterator. For example,
                :class:`~lightautoml.validation.np_iterators.TimeSeriesIterator`.
            valid_data: Optional validation dataset.
            valid_features: Optional validation dataset features,
                if they cannot be inferred from `valid_data`.
            verbose: Verbosity level that is passed to each automl.

        Returns:
            Dataset with predictions. Call ``.data`` to get predictions array.

        """
        self.set_verbosity_level(verbose)

        self.create_automl(
            train_data=train_data,
            roles=roles,
            train_features=train_features,
            cv_iter=cv_iter,
            valid_data=valid_data,
            valid_features=valid_features,
        )
        logger.info(f"Task: {self.task.name}\n")

        logger.info("Start automl preset with listed constraints:")
        logger.info(f"- time: {self.timer.timeout:.2f} seconds")
        logger.info(f"- CPU: {self.cpu_limit} cores")
        logger.info(f"- memory: {self.memory_limit} GB\n")

        self.timer.start()
        result = super().fit_predict(
            train_data,
            roles,
            train_features,
            cv_iter,
            valid_data,
            valid_features,
            verbose=verbose,
        )

        logger.info("\x1b[1mAutoml preset training completed in {:.2f} seconds\x1b[0m\n".format(self.timer.time_spent))
        logger.info(f"Model description:\n{self.create_model_str_desc()}\n")

        return result
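    # Usage sketch (``data`` is the user-supplied training frame from the class
    # docstring; the returned dataset exposes predictions via ``.data``):
    # >>> oof_preds = automl.fit_predict(data, roles={"target": "TARGET"}, verbose=2)
    # >>> oof_preds.data  # array of out-of-fold predictions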
    def create_model_str_desc(self, pref_tab_num: int = 0, split_line_len: int = 0) -> str:  # noqa: D102
        prefix = "\t" * pref_tab_num
        splitter = prefix + "=" * split_line_len + "\n"
        model_stats = sorted(list(self.collect_model_stats().items()))

        last_lvl = model_stats[-1][0].split("_")[1]
        last_lvl_models = [ms for ms in model_stats if ms[0].startswith("Lvl_" + last_lvl)]
        notlast_lvl_models = [ms for ms in model_stats if not ms[0].startswith("Lvl_" + last_lvl)]

        res = ""
        if len(notlast_lvl_models) > 0:
            cur_level = 0
            res += prefix + "Models on level 0:\n"
            for model_stat in notlast_lvl_models:
                model_name, cnt_folds = model_stat
                level = int(model_name.split("_")[1])
                if level != cur_level:
                    cur_level = level
                    res += "\n" + prefix + "Models on level {}:\n".format(cur_level)
                res += prefix + "\t {} averaged models {}\n".format(cnt_folds, model_name)
            res += "\n"

        res += prefix + "Final prediction for new objects (level {}) = \n".format(last_lvl)
        for model_stat, weight in zip(last_lvl_models, self.blender.wts):
            model_name, cnt_folds = model_stat
            res += prefix + "\t {:.5f} * ({} averaged models {}) +\n".format(weight, cnt_folds, model_name)

        if split_line_len == 0:
            return res[:-2]

        return splitter + res[:-2] + "\n" + splitter
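    # Sample of the produced description (model names, fold counts and weights
    # are illustrative; only the "Lvl_<level>_..." key prefix is assumed from
    # ``collect_model_stats``). With split_line_len == 0 and a single level:
    #
    # Final prediction for new objects (level 0) =
    #      0.50000 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LightGBM) +
    #      0.50000 * (5 averaged models Lvl_0_Pipe_1_Mod_0_CatBoost)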
    @staticmethod
    def set_verbosity_level(verbose: int):
        """Verbosity level setter.

        Args:
            verbose: Controls the verbosity: the higher, the more messages.
                <1  : messages are not displayed;
                >=1 : the computation process for layers is displayed;
                >=2 : the information about folds processing is also displayed;
                >=3 : the hyperparameters optimization process is also displayed;
                >=4 : the training process for every algorithm is displayed;

        """
        level = verbosity_to_loglevel(verbose)
        set_stdout_level(level)
        logger.info(f"Stdout logging level is {logging._levelToName[level]}.")
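    # Usage sketch: the verbose -> loglevel mapping is delegated to
    # ``verbosity_to_loglevel`` (not shown here), so the resulting stdout level
    # is an assumption; per the docstring above, 2 also shows fold processing.
    # >>> AutoMLPreset.set_verbosity_level(2)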
    @staticmethod
    def set_logfile(filename: str):  # noqa: D102
        add_filehandler(filename)