# Source code for lightautoml.transformers.numeric
"""Numeric features transformers."""
from typing import Optional
from typing import Union
import numpy as np
from sklearn.preprocessing import QuantileTransformer as SklQntTr
from ..dataset.base import LAMLDataset
from ..dataset.np_pd_dataset import NumpyDataset
from ..dataset.np_pd_dataset import PandasDataset
from ..dataset.roles import CategoryRole
from ..dataset.roles import NumericRole
from .base import LAMLTransformer
# type - something that can be converted to pandas dataset
NumpyTransformable = Union[NumpyDataset, PandasDataset]
def numeric_check(dataset: "LAMLDataset"):
    """Check if all passed vars are numbers.

    Args:
        dataset: Dataset to check.

    Raises:
        AssertionError: If there is non number role.

    """
    roles = dataset.roles
    features = dataset.features
    for f in features:
        assert roles[f].name == "Numeric", "Only numbers accepted in this transformer"
class NaNFlags(LAMLTransformer):
    """Create NaN flags.

    Args:
        nan_rate: Nan rate cutoff.

    """

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "nanflg"

    def __init__(self, nan_rate: float = 0.005):
        self.nan_rate = nan_rate

    def fit(self, dataset: NumpyTransformable):
        """Extract nan flags.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # run the configured checks by hand; output features are a subset of
        # the input columns, so the base-class fit is not used here
        for check in self._fit_checks:
            check(dataset)
        ds = dataset.to_numpy()
        # share of NaN values per column
        nan_share = np.isnan(ds.data).mean(axis=0)
        self.nan_cols = [
            feat for feat, share in zip(ds.features, nan_share) if share > self.nan_rate
        ]
        self._features = list(self.nan_cols)
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - extract null flags.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        # 1.0 where the selected column is NaN, else 0.0
        flags = np.isnan(ds[:, self.nan_cols].data).astype(np.float32)
        output = ds.empty().to_numpy()
        output.set_data(flags, self.features, NumericRole(np.float32))
        return output
class FillnaMedian(LAMLTransformer):
    """Fillna with median."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "fillnamed"

    def fit(self, dataset: NumpyTransformable):
        """Estimate medians.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # base fit sets feature names and runs the numeric checks
        super().fit(dataset)
        arr = dataset.to_numpy().data
        # per-column median ignoring NaNs; all-NaN columns fall back to 0
        meds = np.nanmedian(arr, axis=0)
        self.meds = np.where(np.isnan(meds), 0, meds)
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - fillna with medians.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        # replace NaNs with the column medians estimated in fit
        filled = np.where(np.isnan(ds.data), self.meds, ds.data)
        output = ds.empty().to_numpy()
        output.set_data(filled, self.features, NumericRole(np.float32))
        return output
class FillnaMean(LAMLTransformer):
    """Fillna with mean."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "fillnamean"

    def fit(self, dataset: NumpyTransformable):
        """Estimate means.

        Args:
            dataset: Pandas or Numpy dataset of features.

        Returns:
            self.

        """
        # base fit sets feature names and runs the numeric checks
        super().fit(dataset)
        arr = dataset.to_numpy().data
        # per-column mean ignoring NaNs; all-NaN columns fall back to 0
        means = np.nanmean(arr, axis=0)
        self.means = np.where(np.isnan(means), 0, means)
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - fillna with means.

        Args:
            dataset: Pandas or Numpy dataset of features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        # replace NaNs with the column means estimated in fit
        filled = np.where(np.isnan(ds.data), self.means, ds.data)
        output = ds.empty().to_numpy()
        output.set_data(filled, self.features, NumericRole(np.float32))
        return output
class FillInf(LAMLTransformer):
    """Fill inf with nan to handle as nan value."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "fillinf"

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Replace inf to nan.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        arr = ds.data
        # map +/-inf to NaN so downstream NaN handling covers both
        cleaned = np.where(np.isinf(arr), np.nan, arr)
        output = ds.empty().to_numpy()
        output.set_data(cleaned, self.features, NumericRole(np.float32))
        return output
class LogOdds(LAMLTransformer):
    """Convert probs to logodds."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "logodds"

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - convert num values to logodds.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        # clip away exact 0/1 to keep the log finite, then apply logit
        # TODO: maybe np.exp and then cliping and logodds?
        probs = np.clip(ds.data, 1e-7, 1 - 1e-7)
        logodds = np.log(probs / (1 - probs))
        output = ds.empty().to_numpy()
        output.set_data(logodds, self.features, NumericRole(np.float32))
        return output
class StandardScaler(LAMLTransformer):
    """Classic StandardScaler."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "scaler"

    def fit(self, dataset: NumpyTransformable):
        """Estimate means and stds.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # base fit sets feature names and runs the numeric checks
        super().fit(dataset)
        arr = dataset.to_numpy().data
        self.means = np.nanmean(arr, axis=0)
        stds = np.nanstd(arr, axis=0)
        # constant or all-NaN columns get std 1 to avoid division by zero
        stds[(stds == 0) | np.isnan(stds)] = 1
        self.stds = stds
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Scale test data.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        # z-score with the statistics estimated in fit
        scaled = (ds.data - self.means) / self.stds
        output = ds.empty().to_numpy()
        output.set_data(scaled, self.features, NumericRole(np.float32))
        return output
class QuantileBinning(LAMLTransformer):
    """Discretization of numeric features by quantiles.

    Args:
        nbins: maximum number of bins.

    """

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "qntl"

    def __init__(self, nbins: int = 10):
        self.nbins = nbins

    def fit(self, dataset: NumpyTransformable):
        """Estimate bins borders.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            self.

        """
        # base fit sets feature names and runs the numeric checks
        super().fit(dataset)
        arr = dataset.to_numpy().data
        nan_mask = np.isnan(arr)
        # inner quantile grid, e.g. nbins=10 -> 0.1 ... 0.9
        grid = np.linspace(0, 1, self.nbins + 1)[1:-1]
        # per-column borders over non-NaN values; np.unique drops duplicated
        # borders so heavily-tied columns end up with fewer bins
        self.bins = [
            np.unique(np.quantile(arr[~nan_mask[:, col], col], q=grid))
            for col in range(arr.shape[1])
        ]
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Apply bin borders.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        ds = dataset.to_numpy()
        arr = ds.data
        nan_mask = np.isnan(arr)
        binned = np.zeros(arr.shape, dtype=np.int32)
        for col, borders in enumerate(self.bins):
            # NaNs are sent to +inf so searchsorted puts them in the top bin;
            # they are remapped to the reserved bin 0 below
            safe = np.where(nan_mask[:, col], np.inf, arr[:, col])
            binned[:, col] = np.searchsorted(borders, safe) + 1
        binned = np.where(nan_mask, 0, binned)
        output = ds.empty().to_numpy()
        output.set_data(binned, self.features, CategoryRole(np.int32, label_encoded=True))
        return output
class QuantileTransformer(LAMLTransformer):
    """Transform features using quantiles information."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "qntl_tr"

    def __init__(
        self,
        n_quantiles: Optional[int] = None,
        subsample: int = 1e9,
        output_distribution: str = "normal",
        noise: float = 1e-3,
        qnt_factor: int = 30,
    ):
        """QuantileTransformer.

        Args:
            n_quantiles: Number of quantiles to be computed.
            subsample: Maximum number of samples used to estimate the quantiles for computational efficiency.
            output_distribution: Marginal distribution for the transformed data. The choices are 'uniform' or 'normal'.
            noise: Add noise with certain std to dataset before quantile transformation to make data more smooth.
            qnt_factor: If number of quantiles is none then it equals dataset size / factor

        """
        self.params = {
            "n_quantiles": n_quantiles,
            # coerce to int: recent sklearn versions validate subsample as Integral
            # and reject the float 1e9
            "subsample": int(subsample),
            "copy": False,
            "output_distribution": output_distribution,
            "noise": noise,
        }
        self.qnt_factor = qnt_factor
        self.transformer = None

    def fit(self, dataset: NumpyTransformable):
        """Fit Sklearn QuantileTransformer.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            self.

        """
        for check_func in self._fit_checks:
            check_func(dataset)
        data = dataset.to_numpy().data
        # Work on a copy of the params so self.params stays intact:
        # the previous code did `del self.params["noise"]`, which made a
        # second fit() call fail with KeyError.
        skl_params = dict(self.params)
        noise = skl_params.pop("noise")
        if noise is not None:
            stds = np.std(data, axis=0, keepdims=True)
            noise_std = noise / np.maximum(stds, noise)
            # out-of-place add: `+=` here would corrupt the caller's dataset
            data = data + noise_std * np.random.randn(*data.shape)
        if skl_params["n_quantiles"] is None:
            # derive quantile count from dataset size, clamped to [10, 1000];
            # computed locally so refits on different data recompute it
            skl_params["n_quantiles"] = max(min(data.shape[0] // self.qnt_factor, 1000), 10)
        self.transformer = SklQntTr(**skl_params)
        self.transformer.fit(data)
        self._features = dataset.features
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Apply transformer.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # run transform-time checks
        super().transform(dataset)
        dataset = dataset.to_numpy()
        # NOTE(review): the fitted transformer uses copy=False, so this call
        # may overwrite dataset.data in place — verify callers do not reuse
        # the input dataset afterwards
        new_arr = self.transformer.transform(dataset.data)
        output = dataset.empty().to_numpy()
        output.set_data(new_arr, self.features, NumericRole(np.float32))
        return output