# Source code for lightautoml.dataset.base

"""Contains base classes for internal dataset interface."""

from copy import copy  # , deepcopy
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

from ..tasks.base import Task
from .roles import ColumnRole


# Names of special 1d array-like attributes a dataset may carry alongside its
# feature matrix; these are the only keyword names LAMLDataset._initialize accepts.
valid_array_attributes = ("target", "group", "folds", "weights", "date", "id", "treatment")
# Role class names matching `valid_array_attributes` one-to-one (same order).
array_attr_roles = ("Target", "Group", "Folds", "Weights", "Date", "Id", "Treatment")
# valid_tasks = ('reg', 'binary', 'multiclass') # TODO: Add multiclass and multilabel. Refactor for some dataset and pipes needed
# valid_tasks = ('reg', 'binary')


# Type aliases shared across the dataset interface.
RolesDict = Dict[str, ColumnRole]  # feature name -> its role
IntIdx = Union[Sequence[int], int]  # numeric index or sequence of indexes
RowSlice = Optional[Union[Sequence[int], Sequence[bool]]]  # row selector: int indexes or boolean mask
ColSlice = Optional[Union[Sequence[str], str]]  # column selector: feature name(s)


class LAMLColumn:
    """Basic class for pair - column, role."""

    def __init__(self, data: Any, role: "ColumnRole"):
        """Set a pair column/role.

        Args:
            data: 1d array like.
            role: Column role.

        """
        self.data = data
        self.role = role

    def __repr__(self) -> str:
        """Repr method.

        Returns:
            String with data representation.

        """
        # Delegate to the underlying data container's own repr.
        return self.data.__repr__()
[docs]class LAMLDataset: """Basic class to create dataset.""" # TODO: Create checks here _init_checks = () # list of functions that checks that _array_like_attrs are valid _data_checks = () # list of functions that checks that data in .set_data is valid for _array_like_attrs _concat_checks = () # list of functions that checks that datasets for concatenation are valid _dataset_type = "LAMLDataset"
[docs] def __init__( self, data: Any, features: Optional[list], roles: Optional[RolesDict], task: Optional[Task] = None, **kwargs: Any ): """Create dataset with given data, features, roles and special attributes. Args: data: 2d array of data of special type for each dataset type. features: Feature names or None for empty data. roles: Features roles or None for empty data. task: Task for dataset if train/valid. **kwargs: Special named array of attributes (target, group etc..). """ if features is None: features = [] if roles is None: roles = {} self._initialize(task, **kwargs) if data is not None: self.set_data(data, features, roles)
def __len__(self): """Get count of rows in dataset. Returns: Number of rows in dataset. """ return self.shape[0] def __repr__(self): """Get str representation. Returns: String with data representation. """ # TODO: View for empty return self.data.__repr__() # default behavior and abstract methods def __getitem__(self, k: Tuple[RowSlice, ColSlice]) -> Union["LAMLDataset", LAMLColumn]: """Select a subset of dataset. Define how to slice a dataset in way ``dataset[[1, 2, 3...], ['feat_0', 'feat_1'...]]``. Default behavior based on ``._get_cols``, ``._get_rows``, ``._get_2d``. Args: k: First element optional integer columns indexes, second - optional feature name or list of features names. Returns: Subdataset. """ # TODO: Maybe refactor this part? if type(k) is tuple: rows, cols = k else: rows = k cols = None # case when columns are defined if cols is not None: idx = self._get_cols_idx(cols) data = self._get_2d(self.data, (rows, idx)) # case of single column - return LAMLColumn if isinstance(cols, str): dataset = LAMLColumn(self._get_2d(self.data, (rows, idx)), role=self.roles[cols]) return dataset # case of multiple columns - return LAMLDataset roles = dict(((x, self.roles[x]) for x in self.roles if x in cols)) features = [x for x in cols if x in set(self.features)] else: data, roles, features = self.data, self.roles, self.features # case when rows are defined if rows is None: dataset = self.empty() else: dataset = copy(self) params = dict(((x, self._get_rows(self.__dict__[x], rows)) for x in self._array_like_attrs)) dataset._initialize(self.task, **params) data = self._get_rows(data, rows) dataset.set_data(data, features, roles) return dataset def __setitem__(self, k: str, val: Any): """Inplace set values for single column (in default implementation). Args: k: Feature name. val: :class:`~lightautoml.dataset.base.LAMLColumn` or 1d array like. """ assert k in self.features, "Can only replace existed columns in default implementations." 
idx = self._get_cols_idx(k) # for case when setting col and change role if type(val) is LAMLColumn: assert val.role.dtype == self.roles[k].dtype, "Inplace changing types unavaliable." self._set_col(self.data, idx, val.data) self.roles[k] = val.role # for case only changing column values else: self._set_col(self.data, idx, val) def __getattr__(self, item: str) -> Any: """Get item for key features as target/folds/weights etc. Args: item: Attribute name. Returns: Attribute value. """ if item in valid_array_attributes: return None raise AttributeError @property def features(self) -> list: """Define how to get features names list. Returns: Features names. """ return list(self._features) @features.setter def features(self, val: list): """Define how to set features list. Args: val: Features names. """ self._features = copy(val) @property def data(self) -> Any: """Get data attribute. Returns: Any, array like or ``None``. """ return self._data @data.setter def data(self, val: Any): """Set data array or ``None``. Args: val: Some data or ``None``. """ self._data = val @property def roles(self) -> RolesDict: """Get roles dict. Returns: Dict of feature roles. """ return copy(self._roles) @roles.setter def roles(self, val: RolesDict): """Set roles dict. Args: val: Roles dict. """ self._roles = dict(((x, val[x]) for x in self.features)) @property def inverse_roles(self) -> Dict[ColumnRole, List[str]]: """Get inverse dict of feature roles. Returns: dict, keys - roles, values - features names. """ inv_roles = {} roles = self.roles for k in roles: r = roles[k] if r in inv_roles: inv_roles[r].append(k) else: inv_roles[r] = [k] return inv_roles def _initialize(self, task: Optional[Task], **kwargs: Any): """Initialize empty dataset with task and array like attributes. Args: task: Task name for dataset. **kwargs: 1d arrays like attrs like target, group etc. """ assert all([x in valid_array_attributes for x in kwargs]), "Unknown array attribute. 
Valid are {0}".format( valid_array_attributes ) self.task = task # here we set target and group and so ... self._array_like_attrs = [] for k in kwargs: self._array_like_attrs.append(k) self.__dict__[k] = kwargs[k] # checks for valid values in target, groups ... for check in self._init_checks: check(self) # set empty attributes self._data = None self._features = [] self._roles = {}
[docs] def set_data(self, data: Any, features: Any, roles: Any): """Inplace set data, features, roles for empty dataset. Args: data: 2d array like or ``None``. features: List of features names. roles: Roles dict. """ self.data = data self.features = features self.roles = roles # data checks for check in self._data_checks: check(self)
[docs] def empty(self) -> "LAMLDataset": """Get new dataset for same task and targets, groups, without features. Returns: New empty dataset. """ dataset = copy(self) params = dict(((x, self.__dict__[x]) for x in self._array_like_attrs)) dataset._initialize(self.task, **params) return dataset
def _get_cols_idx(self, columns: Sequence) -> Union[List[int], int]: """Get numeric index of columns by column names. Args: columns: Features names. Returns: List of integer indexes of single int. """ if isinstance(columns, str): idx = self.features.index(columns) else: idx = [self.features.index(x) for x in columns] return idx # default calculated properties @property def shape(self) -> Tuple[Optional[int], Optional[int]]: """Get size of 2d feature matrix. Returns: Tuple of 2 elements. """ rows, cols = None, None try: rows, cols = len(self.data), len(self.features) except TypeError: if len(self._array_like_attrs) > 0: rows = len(self.__dict__[self._array_like_attrs[0]]) return rows, cols # static methods - how to make 1d slice, 2s slice, concat of feature matrix etc ... @staticmethod def _hstack(datasets: Sequence[Any]) -> Any: """Abstract method - define horizontal stack of feature arrays. Args: datasets: Sequence of feature arrays. Returns: # noqa DAR202 Single feature array. """ raise NotImplementedError("Horizontal Stack not implemented.") @staticmethod def _get_rows(data, k: IntIdx) -> Any: """Abstract - define how to make rows slice of feature array. Args: data: 2d feature array. k: Sequence of int indexes or int. Returns: # noqa DAR202 2d feature array. """ raise NotImplementedError("Row Slice not Implemented.") @staticmethod def _get_cols(data, k: IntIdx) -> Any: """Abstract - define how to make columns slice of feature array. Args: data: 2d feature array. k: Sequence indexes or single index. Returns: # noqa DAR202 2d feature array. """ raise NotImplementedError("Column Slice not Implemented.") # TODO: remove classmethod here ? @classmethod def _get_2d(cls, data: Any, k: Tuple[IntIdx, IntIdx]) -> Any: """Default implementation of 2d slice based on rows slice and columns slice. Args: data: 2d feature array. k: Tuple of integer sequences or 2 int. Returns: 2d feature array. 
""" rows, cols = k return cls._get_rows(cls._get_cols(data, cols), rows) @staticmethod def _set_col(data: Any, k: int, val: Any): """Abstract - set a value of single column by column name inplace. Args: data: 2d feature array. k: Column idx. val: 1d column value. """ raise NotImplementedError("Column setting inplace not implemented.")
[docs] @classmethod def concat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset": """Concat multiple dataset. Default behavior - takes empty dataset from datasets[0] and concat all features from others. Args: datasets: Sequence of datasets. Returns: Concated dataset. """ for check in cls._concat_checks: check(datasets) dataset = datasets[0].empty() data = [] features = [] roles = {} atrs = set(dataset._array_like_attrs) for ds in datasets: data.append(ds.data) features.extend(ds.features) roles = {**roles, **ds.roles} for atr in ds._array_like_attrs: if atr not in atrs: dataset._array_like_attrs.append(atr) dataset.__dict__[atr] = ds.__dict__[atr] atrs.update({atr}) data = cls._hstack(data) dataset.set_data(data, features, roles) return dataset
[docs] def drop_features(self, droplist: Sequence[str]): """Inplace drop columns from dataset. Args: droplist: Feature names. Returns: Dataset without columns. """ if len(droplist) == 0: return self return self[:, [x for x in self.features if x not in droplist]]
[docs] @staticmethod def from_dataset(dataset: "LAMLDataset") -> "LAMLDataset": """Abstract method - how to create this type of dataset from others. Args: dataset: Original type dataset. Returns: # noqa DAR202 Converted type dataset. """ raise NotImplementedError
@property def dataset_type(self): """Get type of dataset.""" return self._dataset_type