# Source code for lightautoml.automl.base

"""Base AutoML class."""

import logging

from typing import Any
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence

from ..dataset.base import LAMLDataset
from ..dataset.utils import concatenate
from ..pipelines.ml.base import MLPipeline
from ..reader.base import Reader
from ..utils.logging import set_stdout_level
from ..utils.logging import verbosity_to_loglevel
from ..utils.timer import PipelineTimer
from ..validation.utils import create_validation_iterator
from .blend import BestModelSelector
from .blend import Blender


# Module-level logger, named after this module through ``__name__``.
logger = logging.getLogger(__name__)


class AutoML:
    """Compile and run the full pipeline of an AutoML task.

    AutoML steps:

        - Read, analyze data and get inner
          :class:`~lightautoml.dataset.base.LAMLDataset` from input dataset:
          performed by reader.
        - Create validation scheme.
        - Compute passed ml pipelines from levels.
          Each element of levels is a list of
          :class:`~lightautoml.pipelines.ml.base.MLPipeline`;
          predictions from the current level are passed to the next level
          pipelines as features.
        - Time monitoring - check if we have enough time to calc new pipeline.
        - Blend last level models and prune useless pipelines
          to speedup inference: performed by blender.
        - Returns prediction on validation data.
          If crossvalidation scheme is used,
          out-of-fold prediction will be returned.
          If validation data is passed,
          it will return prediction on validation dataset.
          In case of cv scheme when some point of train data
          never was used as validation (ex. timeout exceeded
          or custom cv iterator like
          :class:`~lightautoml.validation.np_iterators.TimeSeriesIterator`
          was used) NaN for this point will be returned.

    Example:
        Common usecase - create custom pipelines or presets.

        >>> reader = SomeReader()
        >>> pipe = MLPipeline([SomeAlgo()])
        >>> levels = [[pipe]]
        >>> automl = AutoML(reader, levels, )
        >>> automl.fit_predict(data, roles={'target': 'TARGET'})

    Args:
        reader: Instance of Reader class object that
          creates :class:`~lightautoml.dataset.base.LAMLDataset`
          from input data.
        levels: List of list
          of :class:`~lightautoml.pipelines.ml.base.MLPipeline`.
        timer: Timer instance of
          :class:`~lightautoml.utils.timer.PipelineTimer`.
          Default - unlimited timer.
        blender: Instance of Blender.
          Default - :class:`~lightautoml.automl.blend.BestModelSelector`.
        skip_conn: True if we should pass first level
          input features to next levels.
        return_all_predictions: True if we should return all predictions
          from last level models.
        debug: To catch running model exceptions or not.

    Note:
        There are several verbosity levels:

            - `0`: No messages.
            - `1`: Warnings.
            - `2`: Info.
            - `3`: Debug.

    """

    def __init__(
        self,
        reader: Reader,
        levels: Sequence[Sequence[MLPipeline]],
        timer: Optional[PipelineTimer] = None,
        blender: Optional[Blender] = None,
        skip_conn: bool = False,
        return_all_predictions: bool = False,
        debug: bool = False,
    ):
        self._initialize(reader, levels, timer, blender, skip_conn, return_all_predictions, debug)

    def _initialize(
        self,
        reader: Reader,
        levels: Sequence[Sequence[MLPipeline]],
        timer: Optional[PipelineTimer] = None,
        blender: Optional[Blender] = None,
        skip_conn: bool = False,
        return_all_predictions: bool = False,
        debug: bool = False,
    ):
        """Same as __init__. Exists for delayed initialization in presets.

        Args:
            reader: Instance of Reader class object that
              creates :class:`~lightautoml.dataset.base.LAMLDataset`
              from input data.
            levels: List of list
              of :class:`~lightautoml.pipelines.ml.base.MLPipeline`.
            timer: Timer instance of
              :class:`~lightautoml.utils.timer.PipelineTimer`.
              Default - unlimited timer.
            blender: Instance of Blender.
              Default - :class:`~lightautoml.automl.blend.BestModelSelector`.
            skip_conn: True if we should pass first level
              input features to next levels.
            return_all_predictions: True if we should return all predictions
              from last level models.
            debug: To catch running model exceptions or not.

              - ``True`` : show exceptions during model training.
              - ``False``: catch and hide exceptions.

        Raises:
            AssertionError: If ``levels`` is empty.

        """
        assert len(levels) > 0, "At least 1 level should be defined"

        self.timer = PipelineTimer() if timer is None else timer
        self.reader = reader
        self._levels = levels

        # default blender is - select best model and prune other pipes
        self.blender = BestModelSelector() if blender is None else blender

        # update model names so every pipeline carries its level/position
        for i, lvl in enumerate(self._levels):
            for j, pipe in enumerate(lvl):
                pipe.upd_model_names("Lvl_{0}_Pipe_{1}".format(i, j))

        self.skip_conn = skip_conn
        self.return_all_predictions = return_all_predictions
        self.debug = debug

    @staticmethod
    def _attach_input_features(level_predictions: Any, source: Any) -> Any:
        """Convert predictions to the type of ``source`` and join them with it.

        Shared by :meth:`fit_predict` and :meth:`predict` to implement the
        skip connection.

        Args:
            level_predictions: Concatenated predictions of one level.
            source: Dataset holding the input features; its ``from_dataset``
              method is used for the conversion.

        Returns:
            Result of ``concatenate([converted_predictions, source])``.

        Raises:
            TypeError: If ``source.from_dataset`` raises ``TypeError``; the
              original error is kept as ``__cause__``.

        """
        try:
            # convert to initial dataset type
            converted = source.from_dataset(level_predictions)
        except TypeError as err:
            raise TypeError(
                "Can not convert prediction dataset type to input features. Set skip_conn=False"
            ) from err
        return concatenate([converted, source])

    def fit_predict(
        self,
        train_data: Any,
        roles: dict,
        train_features: Optional[Sequence[str]] = None,
        cv_iter: Optional[Iterable] = None,
        valid_data: Optional[Any] = None,
        valid_features: Optional[Sequence[str]] = None,
        verbose: int = 0,
    ) -> LAMLDataset:
        """Fit on input data and make prediction on validation part.

        After a successful fit the initial ``_levels`` attribute is deleted
        and the fitted pipelines are available in ``levels``.

        Args:
            train_data: Dataset to train.
            roles: Roles dict.
            train_features: Optional features names,
              if cannot be inferred from train_data.
            cv_iter: Custom cv iterator. For example,
              :class:`~lightautoml.validation.np_iterators.TimeSeriesIterator`.
            valid_data: Optional validation dataset.
            valid_features: Optional validation dataset
              features if can't be inferred from `valid_data`.
            verbose: Controls the verbosity: the higher, the more messages.
                <1  : messages are not displayed;
                >=1 : the computation process for layers is displayed;
                >=2 : the information about folds processing is also displayed;
                >=3 : the hyperparameters optimization process is also displayed;
                >=4 : the training process for every algorithm is displayed.

        Returns:
            Predicted values.

        Raises:
            AssertionError: If more than one level is configured and either
              the train dataset has no folds or ``valid_data`` is passed.
            TypeError: If ``skip_conn`` is set and predictions can not be
              converted to the input dataset type.

        """
        set_stdout_level(verbosity_to_loglevel(verbose))
        self.timer.start()
        train_dataset = self.reader.fit_read(train_data, train_features, roles)

        assert (
            len(self._levels) <= 1 or train_dataset.folds is not None
        ), "Not possible to fit more than 1 level without cv folds"

        assert (
            len(self._levels) <= 1 or valid_data is None
        ), "Not possible to fit more than 1 level with holdout validation"

        valid_dataset = None
        if valid_data is not None:
            valid_dataset = self.reader.read(valid_data, valid_features, add_array_attrs=True)

        train_valid = create_validation_iterator(train_dataset, valid_dataset, n_folds=None, cv_iter=cv_iter)

        # pre-bind the names used after the loop (keeps static analyzers quiet)
        level_predictions = None
        pipes = None

        self.levels = []

        for leven_number, level in enumerate(self._levels, 1):
            pipes = []
            level_predictions = []
            flg_last_level = leven_number == len(self._levels)

            logger.info(
                f"Layer \x1b[1m{leven_number}\x1b[0m train process start. Time left {self.timer.time_left:.2f} secs"
            )

            for ml_pipe in level:
                ml_pipe.debug = self.debug
                pipe_pred = ml_pipe.fit_predict(train_valid)
                level_predictions.append(pipe_pred)
                pipes.append(ml_pipe)

                logger.info("Time left {:.2f} secs\n".format(self.timer.time_left))

                if self.timer.time_limit_exceeded():
                    logger.info(
                        "Time limit exceeded. Last level models will be blended and unused pipelines will be pruned.\n"
                    )
                    # pipelines not fitted so far are dropped; blend what we have
                    flg_last_level = True
                    break
            else:
                # for-else: reached only when every pipeline of the level was fitted
                if self.timer.child_out_of_time:
                    logger.info(
                        "Time limit exceeded in one of the tasks. AutoML will blend level {0} models.\n".format(
                            leven_number
                        )
                    )
                    flg_last_level = True

            logger.info("\x1b[1mLayer {} training completed.\x1b[0m\n".format(leven_number))

            # here is split on exit condition
            if flg_last_level:
                break

            self.levels.append(pipes)
            level_predictions = concatenate(level_predictions)

            if self.skip_conn:
                valid_part = train_valid.get_validation_data()
                level_predictions = self._attach_input_features(level_predictions, valid_part)

            # predictions of this level become the training data of the next one
            train_valid = create_validation_iterator(level_predictions, None, n_folds=None, cv_iter=None)

        blended_prediction, last_pipes = self.blender.fit_predict(level_predictions, pipes)
        self.levels.append(last_pipes)

        # drop reader features that no remaining pipeline uses
        self.reader.upd_used_features(remove=list(set(self.reader.used_features) - set(self.collect_used_feats())))

        del self._levels

        if self.return_all_predictions:
            return concatenate(level_predictions)
        return blended_prediction

    def predict(
        self,
        data: Any,
        features_names: Optional[Sequence[str]] = None,
        return_all_predictions: Optional[bool] = None,
    ) -> LAMLDataset:
        """Predict with automl on new dataset.

        Args:
            data: Dataset to perform inference.
            features_names: Optional features names,
              if cannot be inferred from `data`.
            return_all_predictions: If True, returns all model predictions
              from last level. If None, the value passed at initialization
              is used.

        Returns:
            Dataset with predictions.

        Raises:
            TypeError: If ``skip_conn`` is set and predictions can not be
              converted to the input dataset type.

        """
        dataset = self.reader.read(data, features_names=features_names, add_array_attrs=False)

        n_levels = len(self.levels)
        for n, level in enumerate(self.levels, 1):
            level_predictions = [ml_pipe.predict(dataset) for ml_pipe in level]

            if n == n_levels:
                # last level: either expose every model or blend them
                want_all = self.return_all_predictions if return_all_predictions is None else return_all_predictions
                if want_all:
                    return concatenate(level_predictions)
                return self.blender.predict(level_predictions)

            level_predictions = concatenate(level_predictions)
            if self.skip_conn:
                dataset = self._attach_input_features(level_predictions, dataset)
            else:
                dataset = level_predictions

    def collect_used_feats(self) -> List[str]:
        """Get feats that automl uses on inference.

        Returns:
            Features names list (unique names, order is not defined).

        """
        used_feats = set()
        for lvl in self.levels:
            for pipe in lvl:
                used_feats.update(pipe.used_features)

        return list(used_feats)

    def collect_model_stats(self) -> Dict[str, int]:
        """Collect info about models in automl.

        Returns:
            Dict mapping each ml algorithm name to ``len(ml_algo.models)``.

        """
        model_stats = {}
        for lvl in self.levels:
            for pipe in lvl:
                for ml_algo in pipe.ml_algos:
                    model_stats[ml_algo.name] = len(ml_algo.models)

        return model_stats