Source code for lightautoml.ml_algo.whitebox

"""AutoMLWhitebox for tabular datasets."""

import warnings

from copy import copy
from copy import deepcopy
from typing import Optional
from typing import Tuple
from typing import Union

import autowoe
import numpy as np

from pandas import DataFrame

from ..dataset.np_pd_dataset import NumpyDataset
from ..dataset.np_pd_dataset import PandasDataset
from ..validation.base import TrainValidIterator
from .base import TabularMLAlgo


WbModel = Union[autowoe.AutoWoE, autowoe.ReportDeco]


class WbMLAlgo(TabularMLAlgo):
    """WhiteBox - scorecard model.

    https://github.com/AILab-MLTools/AutoMLWhitebox

    default_params:

        - monotonic: bool
            Global condition for monotonic constraints.
            If ``True``, then only monotonic binnings will be built.
            You can pass values to the ``.fit`` method that change
            this condition separately for each feature.
        - max_bin_count: int
            Global limit for the number of bins.
            Can be specified for every feature in ``.fit``.
        - select_type: ``None`` or ``int``
            The type to specify the primary feature selection.
            If the type is an integer, then we select the number of features
            indicated by this number (with the best ``feature_importance``).
            If the value is ``None``, we leave only features with
            ``feature_importance`` greater than ``0``.
        - pearson_th: 0 < pearson_th < 1
            Threshold for feature selection by correlation.
            All features with an absolute correlation coefficient
            greater than ``pearson_th`` will be discarded.
        - auc_th: .5 < auc_th < 1
            Threshold for feature selection by one-dimensional AUC.
            WoE with AUC < ``auc_th`` will be discarded.
        - vif_th: vif_th > 0
            Threshold for feature selection by VIF.
            Features with VIF > ``vif_th`` are iteratively discarded one by one,
            then VIF is recalculated until all VIFs are less than ``vif_th``.
        - imp_th: real >= 0
            Threshold for feature selection by feature importance.
        - th_const: int or float
            Threshold that determines whether a feature is constant.
            If the number of valid values is greater than the threshold,
            then the column is not constant.
            For float, the number of valid values is calculated as
            sample size * ``th_const``.
        - force_single_split: bool
            In the tree parameters, you can set the minimum number of
            observations in a leaf, which can make splitting into even
            2 bins impossible for some features.
            If ``force_single_split = True``, a single split will be created
            for such a feature if the minimum bin size is greater
            than ``th_const``.
        - th_nan: int >= 0 or 0 < float < 1
            Threshold that determines whether a separate WoE value
            is calculated for NaNs.
        - th_cat: int >= 0 or 0 < float < 1
            Threshold that determines which categories are rare.
        - woe_diff_th: float = 0.01
            The option to merge NaNs and rare categories with another bin
            if the difference in WoE is less than ``woe_diff_th``.
        - min_bin_size: int > 1, 0 < float < 1
            Minimum bin size when splitting.
        - min_bin_mults: list of floats > 1
            If a minimum bin size is specified, you can pass a list
            to check whether larger values work better, for example: [2, 4].
        - min_gains_to_split: list of floats >= 0
            ``min_gain_to_split`` values that will be iterated over
            to find the best split.
        - auc_tol: 1e-5 <= auc_tol <= 1e-2
            AUC tolerance. You can lower the ``auc_tol`` value from the
            maximum to make the model simpler.
        - cat_alpha: float > 0
            Regularizer for category encoding.
        - cat_merge_to: str
            How WoE values are filled in the test sample for categories
            that are absent from the training sample.
            Values - 'to_nan', 'to_woe_0', 'to_maxfreq', 'to_maxp', 'to_minp'.
        - nan_merge_to: str
            How WoE values are filled in the test sample for real NaNs
            if they are not included in their own group.
            Values - 'to_woe_0', 'to_maxfreq', 'to_maxp', 'to_minp'.
        - oof_woe: bool
            Use OOF or standard encoding for WoE.
        - n_folds: int
            Number of folds for feature selection / encoding, etc.
        - n_jobs: int > 0
            Number of CPU cores to run in parallel.
        - l1_grid_size: real > 0
            Grid size in l1 regularization.
        - l1_exp_scale: real > 1
            Grid scale in l1 regularization.
        - population_size: None, int > 0
            Feature selection type in the selector.
            If the value is ``None``, then L1 boost is used.
            If an ``int`` is specified, then a standard step will be used for
            the number of random subsamples indicated by this value.
            Can be generalized to a genetic algorithm.
        - feature_groups_count: int > 0
            The number of groups in the genetic algorithm.
            Its effect is visible only when ``population_size > 0``.
        - imp_type: str
            Feature importance type. ``feature_imp`` and ``perm_imp``
            are available. It is used to sort the features at the first
            and the final stages of feature selection.
        - regularized_refit: bool
            Use regularization when the model is refit.
            Otherwise, we have a statistical model.
        - p_val: 0 < p_val <= 1
            When training a statistical model, do backward selection until
            all p-values of the model's coefficients are less than ``p_val``.
        - verbose: int 0-3
            Verbosity level.

    freeze_defaults:

        - ``True``: params may be rewritten depending on dataset.
        - ``False``: params may be changed only manually or with tuning.

    timer: :class:`~lightautoml.utils.timer.Timer` instance or ``None``.

    """

    _name: str = "WhiteBox"

    _default_params = {
        "monotonic": False,
        "max_bin_count": 5,
        "select_type": None,
        "pearson_th": 0.9,
        "auc_th": 0.505,
        "vif_th": 10.0,
        "imp_th": 0,
        "th_const": 32,
        "force_single_split": True,
        "th_nan": 0.01,
        "th_cat": 0.005,
        "woe_diff_th": 0.01,
        "min_bin_size": 0.01,
        "cat_alpha": 100,
        "cat_merge_to": "to_woe_0",
        "nan_merge_to": "to_woe_0",
        "oof_woe": True,
        "n_folds": 6,
        "n_jobs": 4,
        "l1_grid_size": 20,
        "l1_exp_scale": 6,
        "imp_type": "feature_imp",
        "regularized_refit": False,
        "p_val": 0.05,
        "report": False,
        "verbose": 0,
    }

    _report_on_inference = False

    def _infer_params(self) -> Tuple[dict, bool, dict]:
        params = deepcopy(self.params)
        report = params.pop("report")
        # "fit_params" is absent from _default_params and is expected to be
        # supplied via default_params (the WhiteBox preset pipeline does this).
        fit_params = params.pop("fit_params")
        self._report_on_inference = report
        return params, report, fit_params

    def fit_predict(self, train_valid_iterator: TrainValidIterator, **kwargs) -> NumpyDataset:  # noqa: D102
        self._dataset_fit_params = kwargs
        return super().fit_predict(train_valid_iterator)

    def _include_target(self, dataset: PandasDataset, include_group: bool = False) -> Tuple[DataFrame, Optional[str]]:
        df = dataset.data.copy()
        if dataset.target is not None:
            df["__TARGET__"], _ = self.task.losses["lgb"].fw_func(dataset.target.values, None)

        group_kf = None
        if include_group and dataset.group is not None:
            assert "__GROUP__" not in dataset.features, "__GROUP__ is not a valid column name for WhiteBox"
            df["__GROUP__"] = dataset.group.values
            group_kf = "__GROUP__"

        return df, group_kf
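    # Editor's note (illustrative): for a frame with columns ["num", "cat"],
    # a binary target, and a group column set on the dataset,
    # `_include_target(dataset, include_group=True)` returns something like
    #
    #      num  cat  __TARGET__  __GROUP__
    #     0.10    a           0         17
    #     0.72    b           1         42
    #
    # together with the group-column name "__GROUP__" (or ``None`` when no
    # group k-fold split is requested).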
    def fit_predict_single_fold(self, train: PandasDataset, valid: PandasDataset) -> Tuple[WbModel, np.ndarray]:
        """Implements training and prediction on a single fold.

        Args:
            train: Train Dataset.
            valid: Validation Dataset.

        Returns:
            Tuple (model, predicted_values).

        """
        params, report, fit_params = self._infer_params()

        assert train.task.name == "binary", "Only the binary task is supported"
        assert "__TARGET__" not in train.features, "__TARGET__ is not a valid column name for WhiteBox"
        if train.weights is not None:
            warnings.warn("Weights are ignored at the moment", UserWarning, stacklevel=2)

        train_df, group_kf = self._include_target(train, True)

        roles = train.roles
        mapping = {"Category": "cat", "Numeric": "real"}
        # Map dataset roles to autowoe feature types, e.g. {"age": "real", "city": "cat"}.
        features_type = {x: mapping[roles[x].name] for x in roles}

        valid_df = None
        if train is not valid:
            valid_df, _ = self._include_target(valid, False)

        model = autowoe.AutoWoE(**params)
        if report:
            model = autowoe.ReportDeco(model)

        kwargs = copy(self._dataset_fit_params)
        kwargs["validation"] = valid_df
        kwargs = {**kwargs, **fit_params}

        model.fit(train_df, target_name="__TARGET__", group_kf=group_kf, features_type=features_type, **kwargs)

        if train is valid:
            valid_df = train_df
        val_pred = model.predict_proba(valid_df)
        val_pred = self.task.losses["lgb"].bw_func(val_pred)

        return model, val_pred
    def predict_single_fold(self, model: WbModel, dataset: PandasDataset) -> np.ndarray:
        """Predict target values for a dataset.

        Args:
            model: WhiteBox model.
            dataset: Test dataset.

        Returns:
            Predicted target values.

        """
        args = []
        if self.params["report"]:
            args = [self._report_on_inference]

        df, _ = self._include_target(dataset, False)
        pred = self.task.losses["lgb"].bw_func(model.predict_proba(df, *args))

        return pred
    def fit(self, train_valid: TrainValidIterator):
        """Just to be compatible with ImportanceEstimator.

        Args:
            train_valid: Classic cv-iterator.

        """
        self.fit_predict(train_valid)
    def predict(self, dataset: PandasDataset, report: bool = False) -> NumpyDataset:
        """Predict on a new dataset.

        Args:
            dataset: Dataset.
            report: Flag to generate report.

        Returns:
            Dataset with predictions.

        """
        self._report_on_inference = report
        return super().predict(dataset)
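
if __name__ == "__main__":
    # Usage sketch (an editor's addition, not part of the upstream module):
    # a minimal end-to-end run on synthetic data, assuming ``autowoe`` and
    # ``lightgbm`` are installed. Run as
    # ``python -m lightautoml.ml_algo.whitebox`` so the relative imports above
    # resolve. The dataset construction follows the public lightautoml API but
    # is illustrative, not canonical usage.
    import pandas as pd

    from ..dataset.roles import CategoryRole
    from ..dataset.roles import NumericRole
    from ..tasks import Task
    from ..validation.base import DummyIterator

    rng = np.random.RandomState(42)
    df = pd.DataFrame({"num": rng.rand(500), "cat": rng.choice(["a", "b", "c"], 500)})
    target = pd.Series((df["num"] + rng.rand(500) > 1.0).astype(int))

    # Roles drive the {"Category": "cat", "Numeric": "real"} mapping
    # used in fit_predict_single_fold above.
    train = PandasDataset(
        df,
        roles={"num": NumericRole(np.float32), "cat": CategoryRole(str)},
        task=Task("binary"),
        target=target,
    )

    # "fit_params" must be supplied explicitly here: _infer_params pops it
    # unconditionally, and _default_params does not define it (the WhiteBox
    # preset pipeline normally injects it). Per-feature overrides accepted by
    # autowoe's ``fit`` (e.g. per-feature monotonic constraints) would go there.
    algo = WbMLAlgo(default_params={"n_folds": 3, "fit_params": {}})
    oof_pred = algo.fit_predict(DummyIterator(train))  # NumpyDataset of in-sample scores
    new_pred = algo.predict(train)                     # inference on (here) the same data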