Source code for lightautoml.pipelines.selection.linear_selector
"""Selectors for linear models."""
from typing import Optional
from typing import Union
import numpy as np
from scipy.sparse import linalg as sp_linalg
from ...validation.base import TrainValidIterator
from .base import SelectionPipeline
class HighCorrRemoval(SelectionPipeline):
    """Selector to remove highly correlated features.

    Drops (almost) totally correlated features to speed up L1 regression
    models. Dense data is compared with the Pearson correlation; for sparse
    data the cosine similarity between columns is used instead. The cosine
    is not exact, but it is good enough to remove very high correlations.

    """

    def __init__(
        self,
        corr_co: float = 0.98,
        subsample: Union[int, float] = 100000,
        random_state: int = 42,
        **kwargs
    ):
        """

        Args:
            corr_co: Similarity threshold. Feature pairs whose absolute
                similarity is strictly greater than it are deduplicated.
            subsample: Number (int) of samples, or fraction (float below 1)
                of the full dataset, used to estimate the similarities.
            random_state: Random seed for the subsample.
            **kwargs: Additional parameters. Used for initialization
                of the parent class.

        """
        super().__init__(**kwargs)
        self.corr_co = corr_co
        self.subsample = subsample
        self.random_state = random_state

    def perform_selection(self, train_valid: Optional[TrainValidIterator]):
        """Select features to save in dataset during selection.

        Method is used to perform selection based on features correlation.
        Saves the ``_selected_features`` attribute in the end of working.

        Args:
            train_valid: Classic cv-iterator.

        """
        train = train_valid.train.data

        # A single feature has nothing to be correlated with.
        if train.shape[1] == 1:
            self._selected_features = train_valid.features
            return

        # Optionally estimate the similarities on a random subset of rows.
        # Only the feature matrix is needed, so the target is not touched.
        n_rows = train.shape[0]
        if self.subsample != 1 and self.subsample < n_rows:
            if self.subsample < 1:
                n_samples = int(n_rows * self.subsample)
            else:
                n_samples = int(self.subsample)
            rng = np.random.RandomState(self.random_state)
            idx = rng.permutation(n_rows)[:n_samples]
            train = train[idx]

        # Pearson correlation for dense data, cosine similarity otherwise.
        if isinstance(train, np.ndarray):
            corr = np.corrcoef(train, rowvar=False)
        else:
            # ``@`` is the matrix product for sparse matrices and sparse
            # arrays alike; ``*`` would be element-wise for the latter.
            xtx = train.T @ train
            norm = sp_linalg.norm(train, axis=0)
            corr = np.array(xtx / (norm[:, np.newaxis] * norm[np.newaxis, :]))
            del xtx

        # Pairs (row < col) whose absolute similarity exceeds the threshold.
        above = np.triu(np.abs(corr) > self.corr_co, k=1)
        # ``np.nonzero`` lists the pairs in row-major order without building
        # full index grids.
        rows, cols = np.nonzero(above)

        removed = set()
        for row, col in zip(rows.tolist(), cols.tolist()):
            # Drop the earlier feature of the pair, unless its partner is
            # already dropped (then the earlier one must be kept).
            if col not in removed:
                removed.add(row)

        # A NaN on the diagonal means the feature's self-similarity is
        # undefined, e.g. a constant dense column or an all-zero sparse
        # column; such features are dropped as well.
        removed.update(np.flatnonzero(np.isnan(np.diagonal(corr))).tolist())

        self._selected_features = [
            name for n, name in enumerate(train_valid.features) if n not in removed
        ]