# Source code for lightautoml.transformers.datetime

"""Datetime features transformers."""

from collections import OrderedDict
from typing import List
from typing import Optional
from typing import Sequence

import holidays
import numpy as np

from ..dataset.base import LAMLDataset
from ..dataset.np_pd_dataset import NumpyDataset
from ..dataset.np_pd_dataset import PandasDataset
from ..dataset.roles import CategoryRole
from ..dataset.roles import ColumnRole
from ..dataset.roles import NumericRole
from .base import LAMLTransformer


# Alias for the dataset type that can hold datetime dtypes; used to annotate
# the ``transform`` inputs of the transformers in this module.
DatetimeCompatible = PandasDataset

# Maps short seasonality codes to attribute names of the pandas ``Series.dt``
# accessor; ``DateSeasons.transform`` resolves them with
# ``getattr(df[col].dt, date_attrs[code])``.
# NOTE: the "ms" code selects ``microsecond`` (not milliseconds).
date_attrs = {
    "y": "year",
    "m": "month",
    "d": "day",
    "wd": "weekday",
    "doy": "dayofyear",
    "hour": "hour",
    "min": "minute",
    "sec": "second",
    "ms": "microsecond",
    "ns": "nanosecond",
}


def datetime_check(dataset: LAMLDataset):
    """Check if all passed vars are datetimes.

    Every feature listed in ``dataset.features`` must have a role whose
    ``name`` equals ``"Datetime"``.

    Args:
        dataset: Dataset to check.

    Raises:
        AssertionError: If non-datetime features are present.

    """
    roles = dataset.roles
    for feature in dataset.features:
        # Raise explicitly instead of using ``assert``: assert statements are
        # removed when Python runs with ``-O``, which would silently disable
        # this validation. The exception type and message are unchanged.
        if roles[feature].name != "Datetime":
            raise AssertionError("Only datetimes accepted in this transformer")


class TimeToNum(LAMLTransformer):
    """Basic conversion strategy, used in selection one-to-one transformers.

    Each datetime value is replaced by its distance from ``basic_time``
    (``basic_time == '2020-01-01'``), measured in ``basic_interval`` units
    (``'D'``).

    """

    basic_time = "2020-01-01"
    basic_interval = "D"

    _fname_prefix = "dtdiff"
    _fit_checks = (datetime_check,)
    _transform_checks = ()

    def transform(self, dataset: DatetimeCompatible) -> NumpyDataset:
        """Transform dates to numeric differences with base date.

        Args:
            dataset: Numpy or Pandas dataset with datetime columns.

        Returns:
            Numpy dataset of numeric features.

        """
        # run the parent-class transform checks first
        super().transform(dataset)

        # work on the pandas representation of the data
        dataset = dataset.to_pandas()
        frame = dataset.data

        # role assigned to every produced column
        out_role = NumericRole(np.float32)

        # offset from the reference date, expressed in ``basic_interval`` units
        offsets = frame - np.datetime64(self.basic_time)
        unit = np.timedelta64(1, self.basic_interval)
        values = (offsets / unit).values.astype(np.float32)

        # pack the result into an empty numpy dataset
        result = dataset.empty().to_numpy()
        result.set_data(values, self.features, out_role)

        return result
class BaseDiff(LAMLTransformer):
    """Pairwise differences between datetime columns.

    For every column in ``base_names`` and every column in ``diff_names`` the
    difference ``diff - base`` is computed and expressed as a ``float32``
    number of ``basic_interval`` units.

    Args:
        base_names: Base date names.
        diff_names: Difference date names.
        basic_interval: Time unit.

    """

    basic_interval = "D"

    _fname_prefix = "basediff"
    _fit_checks = (datetime_check,)
    _transform_checks = ()

    @property
    def features(self) -> List[str]:
        """List of features."""
        return self._features

    def __init__(
        self,
        base_names: Sequence[str],
        diff_names: Sequence[str],
        basic_interval: Optional[str] = "D",
    ):
        self.base_names = base_names
        self.diff_names = diff_names
        self.basic_interval = basic_interval

    def fit(self, dataset: LAMLDataset) -> "LAMLTransformer":
        """Fit transformer and return its instance.

        Builds the output feature names (one per ``(base, diff)`` pair, base
        columns in the outer loop) and runs the fit checks on ``dataset``.

        Args:
            dataset: Dataset to fit on.

        Returns:
            self.

        """
        self._features = []
        for col in self.base_names:
            self._features.extend(["basediff_{0}__{1}".format(col, x) for x in self.diff_names])

        for check_func in self._fit_checks:
            check_func(dataset)

        return self

    def transform(self, dataset: DatetimeCompatible) -> NumpyDataset:
        """Transform dates to numeric differences with base date.

        Args:
            dataset: Numpy or Pandas dataset with datetime columns.

        Returns:
            NumpyDataset of numeric features.

        """
        # checks if exist
        super().transform(dataset)

        # convert to accepted format and get attributes
        dataset = dataset.to_pandas()
        frame = dataset.data

        # ``frame[key]`` treats a tuple as a single column label, so any
        # non-list sequence of names (allowed by the ``Sequence[str]``
        # annotation) must be converted to a list to select several columns.
        diff_values = frame[list(self.diff_names)].values
        base_cols = frame[list(self.base_names)]

        # loop-invariant time unit used to turn timedeltas into plain numbers
        unit = np.timedelta64(1, self.basic_interval)

        # one block of columns per base column, in the same order in which
        # ``fit`` generated the feature names
        feats_block = [
            ((diff_values - base_cols[[col]].values) / unit).astype(np.float32)
            for col in base_cols.columns
        ]
        feats_block = np.concatenate(feats_block, axis=1)

        # create resulted
        output = dataset.empty().to_numpy()
        output.set_data(feats_block, self.features, NumericRole(dtype=np.float32))

        return output
class DateSeasons(LAMLTransformer):
    """Seasonal features extracted from datetime columns.

    For every datetime column one integer feature is produced per seasonality
    code listed in the column role, plus a holiday flag when the role
    defines a country.

    Args:
        output_role: Role assigned to the produced features;
            ``CategoryRole(np.int32)`` is used when ``None``.

    """

    _fname_prefix = "season"
    _fit_checks = (datetime_check,)
    _transform_checks = ()

    @property
    def features(self) -> List[str]:
        """List of features names."""
        return self._features

    def __init__(self, output_role: Optional[ColumnRole] = None):
        self.output_role = CategoryRole(np.int32) if output_role is None else output_role

    def fit(self, dataset: LAMLDataset) -> "LAMLTransformer":
        """Fit transformer and return its instance.

        Records the seasonality codes of every column and builds the list of
        output feature names.

        Args:
            dataset: LAMLDataset to fit on.

        Returns:
            self.

        """
        for check in self._fit_checks:
            check(dataset)

        column_names = dataset.features
        roles = dataset.roles

        self._features = names = []
        self.transformations = plan = OrderedDict()

        for col in column_names:
            role = roles[col]
            codes = role.seasonality
            plan[col] = codes
            names.extend("season_{0}__{1}".format(code, col) for code in codes)
            # a country in the role adds one holiday-flag feature
            if role.country is not None:
                names.append("season_hol__{0}".format(col))

        return self

    @staticmethod
    def _holiday_mask(series, role):
        """Return a boolean Series marking dates that are holidays for ``role``."""
        # get years
        years = np.unique(series.dt.year)
        calendar = holidays.CountryHoliday(
            role.country,
            years=years,
            prov=role.prov,
            state=role.state,
        )
        return series.dt.date.isin(calendar)

    def transform(self, dataset: DatetimeCompatible) -> NumpyDataset:
        """Transform dates to categories - seasons and holiday flag.

        Args:
            dataset: Numpy or Pandas dataset with datetime columns.

        Returns:
            Numpy dataset of numeric features.

        """
        # run the parent-class transform checks first
        super().transform(dataset)

        # work on the pandas representation of the data
        dataset = dataset.to_pandas()
        frame = dataset.data
        roles = dataset.roles

        n_rows = frame.shape[0]
        encoded = np.empty((n_rows, len(self._features)), np.int32)

        # index of the next output column to fill
        pos = 0
        for col in dataset.features:
            series = frame[col]

            for code in self.transformations[col]:
                encoded[:, pos] = getattr(series.dt, date_attrs[code])
                pos += 1

            role = roles[col]
            if role.country is not None:
                encoded[:, pos] = self._holiday_mask(series, role)
                pos += 1

        # pack the result into an empty numpy dataset
        result = dataset.empty().to_numpy()
        result.set_data(encoded, self.features, self.output_role)

        return result