
"""Dimension reduction transformers."""

from typing import List
from typing import Optional
from typing import Union

import numpy as np

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from ..dataset.base import LAMLDataset
from ..dataset.np_pd_dataset import CSRSparseDataset
from ..dataset.np_pd_dataset import NumpyDataset
from ..dataset.np_pd_dataset import PandasDataset
from ..dataset.roles import NumericRole
from .base import LAMLTransformer


# type - something that can be converted to pandas dataset
NumpyTransformable = Union[NumpyDataset, PandasDataset]
NumpyCSR = Union[NumpyDataset, CSRSparseDataset]


# TODO: move all checks to the utils
def numeric_check(dataset: LAMLDataset):
    """Check if all passed vars are categories.

    Args:
        dataset: Dataset to check.

    Raises:
        AssertionError: If there is a non-numeric role.

    """
    roles = dataset.roles
    features = dataset.features
    for f in features:
        assert roles[f].name == "Numeric", "Only numeric features are accepted in this transformer"
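
# For illustration, numeric_check passes silently on an all-numeric dataset and
# raises otherwise (``numeric_ds`` / ``category_ds`` are hypothetical datasets):
#
#     numeric_check(numeric_ds)   # returns None, all roles are Numeric
#     numeric_check(category_ds)  # raises AssertionError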


# TODO: merge into one transformer
class PCATransformer(LAMLTransformer):
    """PCA.

    Args:
        subs: Subsample to fit algorithm. If ``None`` - use full data.
        random_state: Random state to take subsample.
        n_components: Number of PCA components.

    """

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "pca"

    @property
    def features(self) -> List[str]:
        """Features list."""
        return self._features

    def __init__(
        self,
        subs: Optional[int] = None,
        random_state: int = 42,
        n_components: int = 500,
    ):
        self.subs = subs
        self.random_state = random_state
        self.n_components = n_components
        self._pca = PCA
        self.pca = None

    def fit(self, dataset: NumpyTransformable):
        """Fit algorithm on dataset.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Self.

        """
        # set transformer names and add checks
        for check_func in self._fit_checks:
            check_func(dataset)
        # set transformer features

        # convert to accepted dtype and get attributes
        dataset = dataset.to_numpy()
        data = dataset.data

        # cap the number of components by the feature dimension
        self.n_components = np.minimum(self.n_components, data.shape[1] - 1)
        self.pca = self._pca(n_components=self.n_components, random_state=self.random_state)
        self.pca.fit(data)

        # build output feature names like ``pca_<i>__<original name>``
        orig_name = dataset.features[0].split("__")[-1]
        feats = (
            np.char.array([self._fname_prefix + "_"])
            + np.arange(self.n_components).astype(str)
            + np.char.array(["__" + orig_name])
        )
        self._features = list(feats)
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform input dataset to PCA representation.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with PCA components.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        dataset = dataset.to_numpy()
        data = dataset.data

        # transform
        data = self.pca.transform(data)

        # create resulting dataset
        output = dataset.empty().to_numpy()
        output.set_data(data, self.features, NumericRole(np.float32))

        return output
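
# A minimal usage sketch for PCATransformer (hypothetical feature names and data;
# assumes NumpyDataset accepts a features list and a roles mapping as shown):
#
#     import numpy as np
#     from lightautoml.dataset.np_pd_dataset import NumpyDataset
#     from lightautoml.dataset.roles import NumericRole
#
#     data = np.random.rand(100, 16).astype(np.float32)
#     feats = ["feat_{}".format(i) for i in range(16)]
#     roles = {f: NumericRole(np.float32) for f in feats}
#     ds = NumpyDataset(data, feats, roles)
#
#     pca = PCATransformer(n_components=8)
#     out = pca.fit(ds).transform(ds)  # columns: pca_0__feat_0, ..., pca_7__feat_0
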
class SVDTransformer(LAMLTransformer):
    """TruncatedSVD.

    Args:
        subs: Subsample to fit algorithm. If ``None`` - use full data.
        random_state: Random state to take subsample.
        n_components: Number of SVD components.

    """

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "svd"

    @property
    def features(self) -> List[str]:
        """Features list."""
        return self._features

    def __init__(
        self,
        subs: Optional[int] = None,
        random_state: int = 42,
        n_components: int = 100,
    ):
        self.subs = subs
        self.random_state = random_state
        self.n_components = n_components
        self._svd = TruncatedSVD
        self.svd = None

    def fit(self, dataset: NumpyCSR):
        """Fit algorithm on dataset.

        Args:
            dataset: Sparse or Numpy dataset of numeric features.

        Returns:
            Self.

        """
        # set transformer names and add checks
        for check_func in self._fit_checks:
            check_func(dataset)
        # set transformer features

        # get attributes (sparse input is passed to TruncatedSVD as-is)
        data = dataset.data

        # cap the number of components by the feature dimension
        self.n_components = np.minimum(self.n_components, data.shape[1] - 1)
        self.svd = self._svd(n_components=self.n_components, random_state=self.random_state)
        self.svd.fit(data)

        # build output feature names like ``svd_<i>__<original name>``
        orig_name = dataset.features[0].split("__")[-1]
        feats = (
            np.char.array([self._fname_prefix + "_"])
            + np.arange(self.n_components).astype(str)
            + np.char.array(["__" + orig_name])
        )
        self._features = list(feats)
        return self

    def transform(self, dataset: NumpyCSR) -> NumpyDataset:
        """Transform input dataset to SVD representation.

        Args:
            dataset: Sparse or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with SVD components.

        """
        # checks here
        super().transform(dataset)
        # get attributes
        data = dataset.data

        # transform
        data = self.svd.transform(data)

        # create resulting dataset (dense output)
        output = dataset.empty().to_numpy()
        output.set_data(data, self.features, NumericRole(np.float32))

        return output
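
# A minimal usage sketch for SVDTransformer on sparse input (hypothetical names;
# assumes CSRSparseDataset mirrors the NumpyDataset constructor):
#
#     import numpy as np
#     import scipy.sparse as sp
#     from lightautoml.dataset.np_pd_dataset import CSRSparseDataset
#     from lightautoml.dataset.roles import NumericRole
#
#     data = sp.random(100, 50, density=0.1, format="csr", dtype=np.float32)
#     feats = ["tfidf_{}".format(i) for i in range(50)]
#     roles = {f: NumericRole(np.float32) for f in feats}
#     ds = CSRSparseDataset(data, feats, roles)
#
#     svd = SVDTransformer(n_components=10)
#     out = svd.fit(ds).transform(ds)  # dense NumpyDataset: svd_0__tfidf_0, ...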