Source code for lightautoml.pipelines.selection.linear_selector

"""Selectors for linear models."""

from typing import Optional
from typing import Union

import numpy as np

from scipy.sparse import linalg as sp_linalg

from ...validation.base import TrainValidIterator
from .base import SelectionPipeline


class HighCorrRemoval(SelectionPipeline):
    """Selector to remove highly correlated features.

    Drops (almost) totally correlated features to speed up L1 regression
    models. For dense ``numpy`` data the Pearson correlation is used; for
    any other (sparse) container the cosine similarity of the columns is
    used instead. Cosine is not exact, but it is good enough to detect
    very high correlations.

    Args:
        corr_co: Similarity threshold. A pair of features whose absolute
            similarity is strictly greater than this value is treated as
            duplicated.
        subsample: Number (int) of samples, or frac (float) from full dataset.
        random_state: Random seed for subsample.
        **kwargs: Additional parameters. Used for initialization of parent class.

    """

    def __init__(
        self,
        corr_co: float = 0.98,
        subsample: Union[int, float] = 100000,
        random_state: int = 42,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.corr_co = corr_co
        self.subsample = subsample
        self.random_state = random_state

    def perform_selection(self, train_valid: Optional[TrainValidIterator]) -> None:
        """Select features to save in dataset during selection.

        Method is used to perform selection based on features correlation.
        Saves the ``_selected_features`` attribute at the end of working.

        Within every group of mutually similar features the one that comes
        first in ``train_valid.features`` is kept and the later ones are
        dropped. Features whose self-similarity is NaN (e.g. a zero-variance
        column in the dense case or an all-zero column in the sparse case)
        are dropped as well.

        Args:
            train_valid: Classic cv-iterator.

        """
        train = train_valid.train.data

        # Nothing to compare a single feature with.
        if train.shape[1] == 1:
            self._selected_features = train_valid.features
            return

        n_rows = train.shape[0]
        # ``subsample == 1`` means "use the full dataset".
        if self.subsample != 1 and self.subsample < n_rows:
            if self.subsample < 1:
                # Fraction of the dataset.
                n_keep = int(n_rows * self.subsample)
            else:
                # Absolute number of rows.
                n_keep = int(self.subsample)

            idx = np.random.RandomState(self.random_state + 1).permutation(n_rows)[:n_keep]
            train = train[idx]

        # correlation or cosine
        if isinstance(train, np.ndarray):
            corr = np.corrcoef(train, rowvar=False)
        else:
            # ``@`` is a matrix product for both scipy sparse matrices and
            # sparse arrays, whereas ``*`` is element-wise for sparse arrays.
            xtx = train.T @ train
            norm = sp_linalg.norm(train, axis=0)
            corr = np.array(xtx / (norm[:, np.newaxis] * norm[np.newaxis, :]))
            del xtx

        # Strict upper triangle: every unordered pair (first < second) once.
        too_similar = np.triu(np.abs(corr) > self.corr_co, k=1)
        # ``np.nonzero`` yields (row, col) pairs in row-major order, so a
        # feature is always handled as ``first`` after all pairs in which it
        # could have been removed as ``second``.
        first_idx, second_idx = np.nonzero(too_similar)

        removed = set()
        for first, second in zip(first_idx.tolist(), second_idx.tolist()):
            # A feature that is already dropped must not knock out others.
            if first not in removed:
                removed.add(second)

        # A NaN on the diagonal means the similarity of the feature with
        # itself is undefined.
        removed.update(np.flatnonzero(np.isnan(np.diagonal(corr))).tolist())

        self._selected_features = [
            feature for n, feature in enumerate(train_valid.features) if n not in removed
        ]