Source code for lightautoml.dataset.utils

"""Utilities for working with the structure of a dataset."""

from typing import Callable
from typing import Dict
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

from lightautoml.dataset.base import LAMLDataset
from lightautoml.dataset.np_pd_dataset import CSRSparseDataset
from lightautoml.dataset.np_pd_dataset import NumpyDataset
from lightautoml.dataset.np_pd_dataset import PandasDataset
from lightautoml.dataset.roles import ColumnRole


# RoleType = TypeVar("RoleType", bound=ColumnRole)


[docs]def roles_parser(init_roles: Dict[Union[ColumnRole, str], Union[str, Sequence[str]]]) -> Dict[str, ColumnRole]: """Parser of roles. Parse roles from old format numeric: ``[var1, var2, ...]`` to ``{var1:numeric, var2:numeric, ...}``. Args: init_roles: Mapping between roles and feature names. Returns: Roles dict in format key - feature names, value - roles. """ roles = {} for r in init_roles: feat = init_roles[r] if isinstance(feat, str): roles[feat] = r else: for f in init_roles[r]: roles[f] = r return roles
[docs]def get_common_concat( datasets: Sequence[LAMLDataset], ) -> Tuple[Callable, Optional[type]]: """Get concatenation function for datasets of different types. Takes multiple datasets as input and check, if is's ok to concatenate it and return function. Args: datasets: Sequence of datasets. Returns: Function, that is able to concatenate datasets. """ # TODO: Add pandas + numpy via transforming to numpy? dataset_types = set([type(x) for x in datasets]) # general - if single type, concatenation for that type if len(dataset_types) == 1: klass = list(dataset_types)[0] return klass.concat, None # np and sparse goes to sparse elif dataset_types == {NumpyDataset, CSRSparseDataset}: return CSRSparseDataset.concat, CSRSparseDataset elif dataset_types == {NumpyDataset, PandasDataset}: return numpy_and_pandas_concat, None raise TypeError("Unable to concatenate dataset types {0}".format(list(dataset_types)))
[docs]def numpy_and_pandas_concat(datasets: Sequence[Union[NumpyDataset, PandasDataset]]) -> PandasDataset: """Concat of numpy and pandas dataset. Args: datasets: Sequence of datasets to concatenate. Returns: Concatenated dataset. """ datasets = [x.to_pandas() for x in datasets] return PandasDataset.concat(datasets)
[docs]def concatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset: """Dataset concatenation function. Check if datasets have common concat function and then apply. Assume to take target/folds/weights etc from first one. Args: datasets: Sequence of datasets. Returns: Dataset with concatenated features. """ conc, klass = get_common_concat([ds for ds in datasets if ds is not None]) # this part is made to avoid setting first dataset of required type if klass is not None: n = 0 for n, ds in enumerate(datasets): if type(ds) is klass: break datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y] return conc(datasets)