# Source code for lightautoml.transformers.numeric
"""Numeric features transformers."""
from typing import Union
import numpy as np
from ..dataset.base import LAMLDataset
from ..dataset.np_pd_dataset import NumpyDataset
from ..dataset.np_pd_dataset import PandasDataset
from ..dataset.roles import CategoryRole
from ..dataset.roles import NumericRole
from .base import LAMLTransformer
# Type alias: dataset kinds that can be converted to a numpy representation
# (and hence accepted by the transformers in this module).
NumpyTransformable = Union[NumpyDataset, PandasDataset]
def numeric_check(dataset: "LAMLDataset"):
    """Check that all features of the dataset have numeric roles.

    Args:
        dataset: Dataset to check.

    Raises:
        AssertionError: If there is a non-numeric role.

    """
    roles = dataset.roles
    for feat in dataset.features:
        assert roles[feat].name == "Numeric", "Only numbers accepted in this transformer"
class NaNFlags(LAMLTransformer):
    """Create NaN flags."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "nanflg"

    def __init__(self, nan_rate: float = 0.005):
        """

        Args:
            nan_rate: Nan rate cutoff.

        """
        self.nan_rate = nan_rate

    def fit(self, dataset: NumpyTransformable):
        """Extract nan flags.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # run fit-time checks by hand (feature names are only known after
        # selecting the columns, so base fit is not reused here)
        for check in self._fit_checks:
            check(dataset)
        numpy_ds = dataset.to_numpy()
        # share of NaN values per column
        col_nan_share = np.isnan(numpy_ds.data).mean(axis=0)
        # keep only columns whose NaN share exceeds the cutoff
        self.nan_cols = [
            feat for feat, share in zip(numpy_ds.features, col_nan_share) if share > self.nan_rate
        ]
        self._features = list(self.nan_cols)
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - extract null flags.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        selected = numpy_ds[:, self.nan_cols].data
        # flag matrix: 1.0 where the source value is NaN, 0.0 otherwise
        flags = np.isnan(selected).astype(np.float32)
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(flags, self.features, NumericRole(np.float32))
        return out
class FillnaMedian(LAMLTransformer):
    """Fillna with median."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "fillnamed"

    def fit(self, dataset: NumpyTransformable):
        """Estimate medians.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # set transformer names and add checks
        super().fit(dataset)
        # convert to accepted dtype and get attributes
        arr = dataset.to_numpy().data
        # column-wise medians, ignoring NaN entries
        self.meds = np.nanmedian(arr, axis=0)
        # all-NaN columns yield NaN medians - substitute zero
        self.meds[np.isnan(self.meds)] = 0
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - fillna with medians.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        arr = numpy_ds.data
        # replace each NaN with the fitted median of its column
        filled = np.where(np.isnan(arr), self.meds, arr)
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(filled, self.features, NumericRole(np.float32))
        return out
class FillInf(LAMLTransformer):
    """Fill inf with nan to handle as nan value."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "fillinf"

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Replace inf to nan.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        vals = numpy_ds.data
        # both +inf and -inf become NaN, so downstream NaN handling covers them
        cleaned = np.where(np.isinf(vals), np.nan, vals)
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(cleaned, self.features, NumericRole(np.float32))
        return out
class LogOdds(LAMLTransformer):
    """Convert probs to logodds."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "logodds"

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Transform - convert num values to logodds.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        # keep probabilities strictly inside (0, 1) so the log stays finite
        # TODO: maybe np.exp and then cliping and logodds?
        probs = np.clip(numpy_ds.data, 1e-7, 1 - 1e-7)
        logodds = np.log(probs / (1 - probs))
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(logodds, self.features, NumericRole(np.float32))
        return out
class StandardScaler(LAMLTransformer):
    """Classic StandardScaler."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "scaler"

    def fit(self, dataset: NumpyTransformable):
        """Estimate means and stds.

        Args:
            dataset: Pandas or Numpy dataset of categorical features.

        Returns:
            self.

        """
        # set transformer names and add checks
        super().fit(dataset)
        # convert to accepted dtype and get attributes
        arr = dataset.to_numpy().data
        # NaN-aware per-column statistics
        self.means = np.nanmean(arr, axis=0)
        self.stds = np.nanstd(arr, axis=0)
        # constant or all-NaN columns would divide by 0/NaN - scale by 1 instead
        self.stds[(self.stds == 0) | np.isnan(self.stds)] = 1
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Scale test data.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        # center and scale with the fitted statistics
        scaled = (numpy_ds.data - self.means) / self.stds
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(scaled, self.features, NumericRole(np.float32))
        return out
class QuantileBinning(LAMLTransformer):
    """Discretization of numeric features by quantiles."""

    _fit_checks = (numeric_check,)
    _transform_checks = ()
    _fname_prefix = "qntl"

    def __init__(self, nbins: int = 10):
        """

        Args:
            nbins: maximum number of bins.

        """
        self.nbins = nbins

    def fit(self, dataset: NumpyTransformable):
        """Estimate bins borders.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            self.

        """
        # set transformer names and add checks
        super().fit(dataset)
        # convert to accepted dtype and get attributes
        arr = dataset.to_numpy().data
        nan_mask = np.isnan(arr)
        # interior quantile grid: nbins bins need nbins - 1 cut points
        grid = np.linspace(0, 1, self.nbins + 1)[1:-1]
        self.bins = []
        for col in range(arr.shape[1]):
            valid = arr[~nan_mask[:, col], col]
            # deduplicate borders, so heavily-tied columns get fewer bins
            self.bins.append(np.unique(np.quantile(valid, q=grid)))
        return self

    def transform(self, dataset: NumpyTransformable) -> NumpyDataset:
        """Apply bin borders.

        Args:
            dataset: Pandas or Numpy dataset of numeric features.

        Returns:
            Numpy dataset with encoded labels.

        """
        # checks here
        super().transform(dataset)
        # convert to accepted dtype and get attributes
        numpy_ds = dataset.to_numpy()
        arr = numpy_ds.data
        nan_mask = np.isnan(arr)
        binned = np.zeros(arr.shape, dtype=np.int32)
        for col, borders in enumerate(self.bins):
            # NaNs are first pushed past every border via +inf ...
            vals = np.where(nan_mask[:, col], np.inf, arr[:, col])
            # bin labels start at 1; 0 is reserved for missing values
            binned[:, col] = np.searchsorted(borders, vals) + 1
        # ... and then remapped to the dedicated 0 bin
        binned = np.where(nan_mask, 0, binned)
        # create resulted
        out = numpy_ds.empty().to_numpy()
        out.set_data(binned, self.features, CategoryRole(np.int32, label_encoded=True))
        return out