"""Role contains information about the column, which determines how it is processed."""
from datetime import datetime
from typing import Any
from typing import Callable
from typing import Optional
from typing import Sequence
from typing import Union
import numpy as np
# Accepted ways to specify a column dtype: a callable, a type object, or a string.
Dtype = Union[Callable, type, str]
# valid_features_str_names = []
class ColumnRole:
    """Abstract class for column role.

    Role type defines column dtype, place of column in dataset and
    transformers, and sets additional attributes which impact the way
    the column is handled.

    Equality and hashing are both derived from the string representation,
    so two roles are equal when their role name, dtype and instance
    parameters render to the same string.

    """

    # Class-level defaults; instances may shadow them with their own attributes.
    dtype = object
    force_input = False
    _name = "Abstract"

    @property
    def name(self) -> str:
        """Get str role name.

        Returns:
            str role name.

        """
        return self._name

    def __repr__(self) -> str:
        """String view of role.

        Returns:
            Representation string built from the role name, the dtype and
            the instance attributes (``dtype`` itself is not repeated in
            the parameter list).

        """
        # Attribute insertion order is kept, so the string is deterministic
        # for roles constructed the same way.
        params = [(key, value) for key, value in self.__dict__.items() if key not in ("dtype", "name")]
        return "{0} role, dtype {1}. Additional params: {2}".format(self.name, self.dtype, params)

    def __hash__(self) -> int:
        """Define how to hash - hash from str view.

        Returns:
            Hash of the representation string.

        """
        return hash(repr(self))

    def __eq__(self, other: Any) -> bool:
        """Define how to compare - if reprs are equal.

        Args:
            other: Another :class:`~lightautoml.dataset.roles.ColumnRole`
                (any other object simply compares by its ``repr``).

        Returns:
            ``True`` if equal.

        """
        # ``repr(other)`` rather than ``other.__repr__()``: the builtin looks the
        # method up on ``type(other)``, so comparing against a class object
        # (e.g. ``role == int``) returns ``False`` instead of raising ``TypeError``.
        return repr(self) == repr(other)

    @staticmethod
    def from_string(name: str, **kwargs: Any) -> "ColumnRole":
        """Create default params role from string.

        Args:
            name: Role name (case-insensitive).
            kwargs: Other parameters. They are forwarded only to the
                ``target``, ``numeric``, ``category``, ``text``,
                ``datetime``, ``base_date`` and ``date`` roles; the
                remaining roles are created without arguments.

        Returns:
            Corresponding role object.

        Raises:
            ValueError: If ``name`` does not match any known role.

        """
        key = name.lower()

        # Roles that receive the caller's keyword arguments unchanged.
        if key == "target":
            return TargetRole(**kwargs)
        if key == "numeric":
            return NumericRole(**kwargs)
        if key == "category":
            return CategoryRole(**kwargs)
        if key == "text":
            return TextRole(**kwargs)
        if key == "datetime":
            return DatetimeRole(**kwargs)

        # Base-date shortcuts: no seasonality and ``base_date=True`` by default,
        # with explicit keyword arguments taking precedence over these defaults.
        if key in ("base_date", "date"):
            merged = {"seasonality": (), "base_date": True}
            merged.update(kwargs)
            if key == "base_date":
                return DatetimeRole(**merged)
            return DateRole(**merged)

        # The roles below are built with default parameters; ``kwargs`` is
        # not forwarded to them.
        if key == "group":
            return GroupRole()
        if key == "drop":
            return DropRole()
        if key == "id":
            return IdRole()
        if key == "folds":
            return FoldsRole()
        if key == "weights":
            return WeightsRole()
        if key == "path":
            return PathRole()
        if key == "treatment":
            return TreatmentRole()

        raise ValueError("Unknown string role: {}".format(key))
class NumericRole(ColumnRole):
    """Role for numeric columns.

    Args:
        dtype: Variable type.
        force_input: Select a feature for training,
            regardless of the selector results.
        prob: If input number is probability.
        discretization: Flag of discretization.

    """

    _name = "Numeric"

    def __init__(
        self,
        dtype: Dtype = np.float32,
        force_input: bool = False,
        prob: bool = False,
        discretization: bool = False,
    ) -> None:
        # Assignment order is significant: it fixes the parameter order in repr().
        self.dtype = dtype
        self.force_input = force_input
        self.prob = prob
        self.discretization = discretization
class CategoryRole(ColumnRole):
    """Role for categorical columns.

    Args:
        dtype: Variable type.
        encoding_type: Encoding type.
        unknown: Cut-off freq to process rare categories as unseen.
        force_input: Select a feature for training,
            regardless of the selector results.
        label_encoded: Flag stored on the role as-is; presumably marks a
            column that is already label encoded (confirm with consumers).
        ordinal: Ordinal category.

    Note:
        Valid encoding_type:

            - `'auto'` - default processing
            - `'int'` - encode with int
            - `'oof'` - out-of-fold target encoding
            - `'freq'` - frequency encoding
            - `'ohe'` - one hot encoding

    """

    _name = "Category"

    def __init__(
        self,
        dtype: Dtype = object,
        encoding_type: str = "auto",
        unknown: int = 5,
        force_input: bool = False,
        label_encoded: bool = False,
        ordinal: bool = False,
    ) -> None:
        # TODO: assert dtype is object, 'Dtype for category should be defined' ?
        # TODO: support all encodings
        # No validation is performed here; values are stored verbatim, in this
        # order (the order determines how parameters appear in repr()).
        self.dtype = dtype
        self.encoding_type = encoding_type
        self.unknown = unknown
        self.force_input = force_input
        self.label_encoded = label_encoded
        self.ordinal = ordinal
class TextRole(ColumnRole):
    """Role for text columns.

    Args:
        dtype: Variable type.
        force_input: Select a feature for training,
            regardless of the selector results. Enabled by default
            for this role.

    """

    _name = "Text"

    def __init__(self, dtype: Dtype = str, force_input: bool = True) -> None:
        self.dtype = dtype
        self.force_input = force_input
class DatetimeRole(ColumnRole):
    """Role for datetime columns.

    Args:
        dtype: Variable type.
        seasonality: Seasons to extract from date.
            Valid are: 'y', 'm', 'd', 'wd', 'hour', 'min', 'sec', 'ms', 'ns'.
            ``None`` is stored as an empty list.
        base_date: Base date is used to calculate difference
            with other dates, like `age = report_dt - birth_dt`.
            A truthy value also forces ``force_input`` to ``True``.
        date_format: Format to parse date (stored as ``self.format``).
        unit: The unit of the arg denote the unit, pandas like, see more:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html.
        origin: Define the reference date, pandas like, see more:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html.
        force_input: Select a feature for training,
            regardless of the selector results.
        base_feats: To calculate feats on base date.
        country: Datetime metadata to extract holidays.
        prov: Datetime metadata to extract holidays.
        state: Datetime metadata to extract holidays.

    """

    _name = "Datetime"

    def __init__(
        self,
        dtype: Dtype = np.datetime64,
        seasonality: Optional[Sequence[str]] = ("y", "m", "wd"),
        base_date: bool = False,
        date_format: Optional[str] = None,
        unit: Optional[str] = None,
        origin: Union[str, datetime] = "unix",
        force_input: bool = False,
        base_feats: bool = True,
        country: Optional[str] = None,
        prov: Optional[str] = None,
        state: Optional[str] = None,
    ) -> None:
        # Attributes are assigned in a fixed order because repr() lists them
        # in insertion order.
        self.dtype = dtype
        # A missing seasonality means "extract nothing".
        self.seasonality = seasonality if seasonality is not None else []
        self.base_date = base_date
        self.format = date_format
        self.unit = unit
        self.origin = origin
        # Base dates are always kept as model input, whatever the caller asked for.
        self.force_input = True if self.base_date else force_input
        self.base_feats = base_feats
        self.country = country
        self.prov = prov
        self.state = state
# class MixedRole(ColumnRole):
# """
# Mixed role. If exact role extraction is difficult, it goes into both pipelines
# """
class TargetRole(ColumnRole):
    """Role for the target column.

    Args:
        dtype: Dtype of target.

    """

    _name = "Target"

    def __init__(self, dtype: Dtype = np.float32) -> None:
        self.dtype = dtype
class GroupRole(ColumnRole):
    """Group role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Group"
class DropRole(ColumnRole):
    """Drop role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Drop"
class WeightsRole(ColumnRole):
    """Weights role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Weights"
class FoldsRole(ColumnRole):
    """Folds role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Folds"
class PathRole(ColumnRole):
    """Path role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Path"
class DateRole(DatetimeRole):
    """Date role.

    Accepts the same constructor arguments as the datetime role it extends;
    only the role name differs.

    """

    _name = "Date"
class IdRole(CategoryRole):
    """Id role.

    Accepts the same constructor arguments as the category role it extends;
    only the role name differs.

    """

    _name = "Id"
class TreatmentRole(ColumnRole):
    """Uplift Treatment Role.

    Adds no parameters of its own; only the role name differs from the base class.

    """

    _name = "Treatment"