# Source code for lightautoml.dataset.roles

"""Role contains information about the column, which determines how it is processed."""

from datetime import datetime
from typing import Any
from typing import Callable
from typing import Optional
from typing import Sequence
from typing import Union

import numpy as np


Dtype = Union[Callable, type, str]


# valid_features_str_names = []



class ColumnRole:
    """Abstract class for column role.

    Role type defines column dtype, place of column in dataset and
    transformers, and sets additional attributes that affect the way
    the column is handled.

    Roles are compared and hashed through their string representation,
    so two roles are equal when role name, dtype and instance parameters
    (in the order they were set) all match.

    """

    dtype = object
    force_input = False
    _name = "Abstract"

    @property
    def name(self) -> str:
        """Get str role name.

        Returns:
            str role name.

        """
        return self._name

    def __repr__(self) -> str:
        """String view of role.

        Only instance attributes are listed as additional params, in the
        order they were assigned; ``dtype`` is reported separately.

        Returns:
            Representation string.

        """
        params = [(key, value) for key, value in self.__dict__.items() if key not in ("dtype", "name")]

        return "{0} role, dtype {1}. Additional params: {2}".format(self.name, self.dtype, params)

    def __hash__(self) -> int:
        """Define how to hash - hash from str view.

        Returns:
            Hash of the representation string.

        """
        return hash(repr(self))

    def __eq__(self, other: Any) -> bool:
        """Define how to compare - if reprs are equal (hashed).

        Args:
            other: Another :class:`~lightautoml.dataset.roles.ColumnRole`.

        Returns:
            ``True`` if equal.

        """
        return repr(self) == repr(other)

    @staticmethod
    def from_string(name: str, **kwargs: Any) -> "ColumnRole":
        """Create default params role from string.

        The name is matched case-insensitively. ``kwargs`` are forwarded to
        the role constructor for ``'target'``, ``'numeric'``, ``'category'``,
        ``'text'``, ``'datetime'``, ``'base_date'`` and ``'date'``. For
        ``'group'``, ``'drop'``, ``'id'``, ``'folds'``, ``'weights'``,
        ``'path'`` and ``'treatment'`` the role is built without arguments
        and ``kwargs`` are ignored.

        Args:
            name: Role name.
            kwargs: Other parameters.

        Returns:
            Corresponding role object.

        Raises:
            ValueError: If ``name`` does not match any known role.

        """
        name = name.lower()

        if name == "target":
            return TargetRole(**kwargs)

        if name == "numeric":
            return NumericRole(**kwargs)

        if name == "category":
            return CategoryRole(**kwargs)

        if name == "text":
            return TextRole(**kwargs)

        if name == "datetime":
            return DatetimeRole(**kwargs)

        if name == "base_date":
            # Defaults for a base date; explicit kwargs take precedence.
            kwargs = {"seasonality": (), "base_date": True, **kwargs}
            return DatetimeRole(**kwargs)

        if name == "date":
            # Same defaults as 'base_date', but produces a DateRole.
            kwargs = {"seasonality": (), "base_date": True, **kwargs}
            return DateRole(**kwargs)

        if name == "group":
            return GroupRole()

        if name == "drop":
            return DropRole()

        if name == "id":
            return IdRole()

        if name == "folds":
            return FoldsRole()

        if name == "weights":
            return WeightsRole()

        if name == "path":
            return PathRole()

        if name == "treatment":
            return TreatmentRole()

        raise ValueError("Unknown string role: {}".format(name))





class NumericRole(ColumnRole):
    """Role for numeric columns.

    Args:
        dtype: Variable type.
        force_input: Select a feature for training,
            regardless of the selector results.
        prob: If input number is probability.
        discretization: Flag of discretization.

    """

    _name = "Numeric"

    def __init__(
        self,
        dtype: Dtype = np.float32,
        force_input: bool = False,
        prob: bool = False,
        discretization: bool = False,
    ):
        # Attributes are set in a fixed order: the role's string view lists
        # instance attributes in the order they were assigned.
        self.dtype, self.force_input = dtype, force_input
        self.prob, self.discretization = prob, discretization




class CategoryRole(ColumnRole):
    """Role for categorical columns.

    Args:
        dtype: Variable type.
        encoding_type: Encoding type.
        unknown: Cut-off freq to process rare categories as unseen.
        force_input: Select a feature for training,
            regardless of the selector results.
        label_encoded: Stored on the role unchanged; presumably marks a
            column that is already label encoded (to be confirmed
            against the code that consumes this flag).
        ordinal: Ordinal category.

    Note:
        Valid encoding_type:

            - `'auto'` - default processing
            - `'int'` - encode with int
            - `'oof'` - out-of-fold target encoding
            - `'freq'` - frequency encoding
            - `'ohe'` - one hot encoding

    """

    _name = "Category"

    def __init__(
        self,
        dtype: Dtype = object,
        encoding_type: str = "auto",
        unknown: int = 5,
        force_input: bool = False,
        label_encoded: bool = False,
        ordinal: bool = False,
    ):
        # TODO: assert dtype is object, 'Dtype for category should be defined' ?
        # TODO: support all encodings
        # No validation is performed; values are stored as given. The
        # assignment order is fixed because the role's string view lists
        # instance attributes in the order they were assigned.
        self.dtype, self.encoding_type = dtype, encoding_type
        self.unknown, self.force_input = unknown, force_input
        self.label_encoded, self.ordinal = label_encoded, ordinal




class TextRole(ColumnRole):
    """Role for text columns.

    Args:
        dtype: Variable type.
        force_input: Select a feature for training,
            regardless of the selector results. Defaults to ``True``
            for this role.

    """

    _name = "Text"

    def __init__(self, dtype: Dtype = str, force_input: bool = True):
        # dtype first, then force_input: the role's string view lists
        # instance attributes in the order they were assigned.
        self.dtype, self.force_input = dtype, force_input




class DatetimeRole(ColumnRole):
    """Role for datetime columns.

    Args:
        dtype: Variable type.
        seasonality: Seasons to extract from date.
            Valid are: 'y', 'm', 'd', 'wd', 'hour', 'min', 'sec', 'ms', 'ns'.
            ``None`` is stored as an empty list.
        base_date: Base date is used to calculate difference
            with other dates, like `age = report_dt - birth_dt`.
            When truthy, ``force_input`` is set to ``True``.
        date_format: Format to parse date. Stored as the ``format`` attribute.
        unit: The unit of the arg denote the unit, pandas like, see more:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html.
        origin: Define the reference date, pandas like, see more:
            https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html.
        force_input: Select a feature for training,
            regardless of the selector results.
        base_feats: To calculate feats on base date.
        country: Datetime metadata to extract holidays.
        prov: Datetime metadata to extract holidays.
        state: Datetime metadata to extract holidays.

    """

    _name = "Datetime"

    def __init__(
        self,
        dtype: Dtype = np.datetime64,
        seasonality: Optional[Sequence[str]] = ("y", "m", "wd"),
        base_date: bool = False,
        date_format: Optional[str] = None,
        unit: Optional[str] = None,
        origin: Union[str, datetime] = "unix",
        force_input: bool = False,
        base_feats: bool = True,
        country: Optional[str] = None,
        prov: Optional[str] = None,
        state: Optional[str] = None,
    ):
        # The assignment order below is fixed: the role's string view lists
        # instance attributes in the order they were assigned.
        self.dtype = dtype
        # A missing seasonality means "extract nothing".
        self.seasonality = [] if seasonality is None else seasonality
        self.base_date = base_date
        self.format = date_format
        self.unit = unit
        self.origin = origin

        # A base date is always forced into the input.
        self.force_input = True if base_date else force_input
        self.base_feats = base_feats

        # Holiday-related metadata.
        self.country, self.prov, self.state = country, prov, state



# class MixedRole(ColumnRole):
#     """
#     Mixed role. If exact role extraction is difficult, it goes into both pipelines
#     """



class TargetRole(ColumnRole):
    """Target role.

    Only ``dtype`` is stored on the instance; ``force_input`` keeps the
    :class:`ColumnRole` class default.

    Args:
        dtype: Dtype of target.

    """

    _name = "Target"

    def __init__(self, dtype: Dtype = np.float32) -> None:
        self.dtype = dtype




class GroupRole(ColumnRole):
    """Group role.

    Marker role named ``"Group"``: it adds no parameters, so ``dtype`` and
    ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("group")``.

    """

    _name = "Group"




class DropRole(ColumnRole):
    """Drop role.

    Marker role named ``"Drop"``: it adds no parameters, so ``dtype`` and
    ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("drop")``.

    """

    _name = "Drop"




class WeightsRole(ColumnRole):
    """Weights role.

    Marker role named ``"Weights"``: it adds no parameters, so ``dtype``
    and ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("weights")``.

    """

    _name = "Weights"




class FoldsRole(ColumnRole):
    """Folds role.

    Marker role named ``"Folds"``: it adds no parameters, so ``dtype`` and
    ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("folds")``.

    """

    _name = "Folds"




class PathRole(ColumnRole):
    """Path role.

    Marker role named ``"Path"``: it adds no parameters, so ``dtype`` and
    ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("path")``.

    """

    _name = "Path"



class DateRole(DatetimeRole):
    """Date role.

    Subclass of :class:`DatetimeRole` that changes only the role name to
    ``"Date"``; constructor and parameters are inherited unchanged.
    ``ColumnRole.from_string("date")`` builds it with ``seasonality=()``
    and ``base_date=True`` unless overridden by keyword arguments.

    """

    _name = "Date"


class IdRole(CategoryRole):
    """Id role.

    Subclass of :class:`CategoryRole` that changes only the role name to
    ``"Id"``; constructor and parameters are inherited unchanged.
    ``ColumnRole.from_string("id")`` builds it with the default arguments.

    """

    _name = "Id"


class TreatmentRole(ColumnRole):
    """Uplift Treatment Role.

    Marker role named ``"Treatment"``: it adds no parameters, so ``dtype``
    and ``force_input`` keep the :class:`ColumnRole` class defaults.
    Returned by ``ColumnRole.from_string("treatment")``.

    """

    _name = "Treatment"