"""Iterative feature selector."""
import logging
from copy import deepcopy
from typing import Optional
import numpy as np
from pandas import Series
from lightautoml.validation.base import TrainValidIterator
from ...dataset.base import LAMLDataset
from ...ml_algo.base import MLAlgo
from ...ml_algo.utils import tune_and_fit_predict
from ..features.base import FeaturesPipeline
from .base import ImportanceEstimator
from .base import PredefinedSelector
from .base import SelectionPipeline
logger = logging.getLogger(__name__)
def _create_chunks_from_list(lst, n):
    """Creates chunks of list.

    Args:
        lst: List of elements.
        n: Size of chunk.

    Returns:
        Sequential chunks.

    """
    chunks = []
    for i in range(0, len(lst), n):
        chunks.append(lst[i : i + n])
    return chunks
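

# Example of the chunking behaviour (hypothetical values): a list of 7
# elements with a chunk size of 3 yields two full chunks and one remainder:
#
#     >>> _create_chunks_from_list([1, 2, 3, 4, 5, 6, 7], 3)
#     [[1, 2, 3], [4, 5, 6], [7]]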
class NpPermutationImportanceEstimator(ImportanceEstimator):
    """Permutation importance based estimator.

    Importance is calculated using a random permutation
    of the values in a single column for each feature.

    Args:
        random_state: Seed for random generation of features permutation.

    """

    def __init__(self, random_state: int = 42):
        super().__init__()
        self.random_state = random_state
    def fit(
        self,
        train_valid: Optional[TrainValidIterator] = None,
        ml_algo: Optional[MLAlgo] = None,
        preds: Optional[LAMLDataset] = None,
    ):
        """Find importances for each feature in dataset.

        Args:
            train_valid: Initial dataset iterator.
            ml_algo: Algorithm.
            preds: Predicted target values for validation dataset.

        """
        normal_score = ml_algo.score(preds)
        logger.debug("Normal score = {}".format(normal_score))

        valid_data = train_valid.get_validation_data()
        valid_data = valid_data.to_numpy()

        permutation = np.random.RandomState(seed=self.random_state + 1).permutation(valid_data.shape[0])
        permutation_importance = {}

        for it, col in enumerate(valid_data.features):
            logger.debug("Start processing ({},{})".format(it, col))
            # Save initial column
            save_col = deepcopy(valid_data[:, col])

            # Get current column and shuffle it
            shuffled_col = valid_data[permutation, col]

            # Set shuffled column
            logger.info3("Shuffled column set")
            valid_data[col] = shuffled_col

            # Calculate predict and metric
            new_preds = ml_algo.predict(valid_data)
            shuffled_score = ml_algo.score(new_preds)
            logger.debug(
                "Shuffled score for col {} = {}, difference with normal = {}".format(
                    col, shuffled_score, normal_score - shuffled_score
                )
            )
            permutation_importance[col] = normal_score - shuffled_score

            # Set normal column back to the dataset
            logger.debug("Normal column set")
            valid_data[col] = save_col

        self.raw_importances = Series(permutation_importance).sort_values(ascending=False)
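

# A minimal, self-contained sketch of the same permutation-importance idea on
# plain numpy arrays. `model` is a hypothetical object with sklearn-style
# `score(X, y)`; it is not part of this module's API and is shown only to
# illustrate the "score drop after shuffling one column" computation above.
def _permutation_importance_sketch(model, X, y, random_state=42):
    """Toy illustration: importance = score drop after shuffling each column."""
    rng = np.random.RandomState(seed=random_state)
    base_score = model.score(X, y)
    perm = rng.permutation(X.shape[0])
    importances = {}
    for j in range(X.shape[1]):
        saved = X[:, j].copy()  # keep the original column
        X[:, j] = X[perm, j]  # shuffle this single column
        importances[j] = base_score - model.score(X, y)
        X[:, j] = saved  # restore the column before moving on
    return Series(importances).sort_values(ascending=False)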
class NpIterativeFeatureSelector(SelectionPipeline):
    """Select features sequentially using chunks to find the best combination of chunks.

    The general idea of this algorithm is to sequentially
    check groups of features ordered by feature importance:
    if adding a group improves the quality of the model,
    the group is selected; otherwise it is ignored.

    Args:
        feature_pipeline: Composition of feature transforms.
        ml_algo: Tuple (MlAlgo, ParamsTuner).
        imp_estimator: Feature importance estimator.
        fit_on_holdout: Whether to use the holdout iterator.
        feature_group_size: Chunk size.
        max_features_cnt_in_result: Upper bound on the number of selected
            features; once it is reached, selection stops.

    """
    def __init__(
        self,
        feature_pipeline: FeaturesPipeline,
        ml_algo: Optional[MLAlgo] = None,
        imp_estimator: Optional[ImportanceEstimator] = None,
        fit_on_holdout: bool = True,
        feature_group_size: Optional[int] = 5,
        max_features_cnt_in_result: Optional[int] = None,
    ):
        if not fit_on_holdout:
            logger.info2(
                "This selector works only with holdout training. The fit_on_holdout argument is kept just for compatibility."
            )
        super().__init__(feature_pipeline, ml_algo, imp_estimator, True)
        self.feature_group_size = feature_group_size
        self.max_features_cnt_in_result = max_features_cnt_in_result
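

# A hedged usage sketch of wiring the selector together. The pipeline, algo
# and iterator objects below (my_feature_pipeline, my_ml_algo_with_tuner,
# train_valid_iterator) are placeholders for whatever concrete
# FeaturesPipeline, (MLAlgo, ParamsTuner) pair and TrainValidIterator you use;
# only the selector classes themselves come from this module:
#
#     estimator = NpPermutationImportanceEstimator(random_state=42)
#     selector = NpIterativeFeatureSelector(
#         feature_pipeline=my_feature_pipeline,
#         ml_algo=my_ml_algo_with_tuner,
#         imp_estimator=estimator,
#         feature_group_size=5,
#         max_features_cnt_in_result=50,
#     )
#     selector.fit(train_valid_iterator)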