
"""Internal representation of dataset in numpy, pandas and csr formats."""

from copy import copy  # , deepcopy
from typing import Any
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import TypeVar
from typing import Union

import numpy as np
import pandas as pd

from pandas import DataFrame
from pandas import Series
from scipy import sparse

from ..tasks.base import Task
from .base import IntIdx
from .base import LAMLDataset
from .base import RolesDict
from .base import array_attr_roles
from .base import valid_array_attributes
from .roles import ColumnRole
from .roles import DropRole
from .roles import NumericRole


# disable warnings later
# pd.set_option('mode.chained_assignment', None)

NpFeatures = Union[Sequence[str], str, None]
NpRoles = Union[Sequence[ColumnRole], ColumnRole, RolesDict, None]
DenseSparseArray = Union[np.ndarray, sparse.csr_matrix]
FrameOrSeries = Union[DataFrame, Series]
Dataset = TypeVar("Dataset", bound=LAMLDataset)


# possible checks list
# valid shapes
# target var is ok for task
# pandas - roles for all columns are defined
# numpy - roles and features are consistent with each other
# numpy - roles and features are consistent with data
# feature names do not contain __ - it's used to split processing names

# sparse - do not replace init and set data, but move type assert in checks?


class NumpyDataset(LAMLDataset):
    """Dataset that contains info in np.ndarray format.

    Create dataset from numpy arrays.

    Args:
        data: 2d array of features.
        features: Features names.
        roles: Roles specifier.
        task: Task specifier.
        **kwargs: Named attributes like target, group etc ..

    Note:
        For different types of the ``features`` parameter there is different behavior:

            - list: should be the same length as ``data.shape[1]``.
            - None: automatically set names like ``feat_0``, ``feat_1`` ...
            - ``'Prefix'``: automatically set names like ``Prefix_0``, ``Prefix_1`` ...

        For different types of the ``roles`` parameter there is different behavior:

            - list: should be the same length as ``data.shape[1]``.
            - None: automatically set ``NumericRole(np.float32)``.
            - ``ColumnRole``: single role for all columns.
            - dict: mapping from feature name to role.

        A hypothetical usage sketch follows this class definition.

    """

    # TODO: Checks here
    _init_checks = ()
    _data_checks = ()
    _concat_checks = ()
    _dataset_type = "NumpyDataset"

    @property
    def features(self) -> List[str]:
        """Features list."""
        return list(self._features)

    @features.setter
    def features(self, val: Union[Sequence[str], str, None]):
        """Define how to set features.

        Args:
            val: Values of features.

        Note:
            There is different behavior for different types of the val parameter:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set names like ``feat_0``, ``feat_1`` ...
                - ``'Prefix'``: automatically set names like ``Prefix_0``, ``Prefix_1`` ...

        """
        if type(val) is list:
            self._features = copy(val)
        else:
            prefix = val if val is not None else "feat"
            self._features = ["{0}_{1}".format(prefix, x) for x in range(self.data.shape[1])]

    @property
    def roles(self) -> RolesDict:
        """Roles dict."""
        return copy(self._roles)

    @roles.setter
    def roles(self, val: NpRoles):
        """Define how to set roles.

        Args:
            val: Roles.

        Note:
            There is different behavior for different types of the val parameter:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set ``NumericRole(np.float32)``.
                - ``ColumnRole``: single role for all columns.
                - dict: mapping from feature name to role.

        """
        if type(val) is dict:
            self._roles = dict(((x, val[x]) for x in self.features))
        elif type(val) is list:
            self._roles = dict(zip(self.features, val))
        else:
            role = NumericRole(np.float32) if val is None else val
            self._roles = dict(((x, role) for x in self.features))

    def _check_dtype(self):
        """Check if dtype in ``.set_data`` is ok and cast if not.

        Raises:
            AssertionError: If there is a non-numeric type in the dataset.

        """
        dtypes = list(set([i.dtype for i in self.roles.values()]))
        self.dtype = np.find_common_type(dtypes, [])

        for f in self.roles:
            self._roles[f].dtype = self.dtype

        assert np.issubdtype(self.dtype, np.number), "Support only numeric types in numpy dataset."

        if self.data.dtype != self.dtype:
            # cast to the common dtype if possible; keep the original data otherwise
            try:
                self.data = self.data.astype(self.dtype)
            except (TypeError, ValueError):
                pass

    def __init__(
        self,
        data: Optional[DenseSparseArray],
        features: NpFeatures = (),
        roles: NpRoles = None,
        task: Optional[Task] = None,
        **kwargs: np.ndarray
    ):
        self._initialize(task, **kwargs)
        if data is not None:
            self.set_data(data, features, roles)
    def set_data(self, data: DenseSparseArray, features: NpFeatures = (), roles: NpRoles = None):
        """Inplace set data, features, roles for empty dataset.

        Args:
            data: 2d np.ndarray of features.
            features: Features names.
            roles: Roles specifier.

        Note:
            For different types of the ``features`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set names like ``feat_0``, ``feat_1`` ...
                - ``'Prefix'``: automatically set names like ``Prefix_0``, ``Prefix_1`` ...

            For different types of the ``roles`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set ``NumericRole(np.float32)``.
                - ``ColumnRole``: single role for all columns.
                - dict: mapping from feature name to role.

        """
        assert data is None or type(data) is np.ndarray, "Numpy dataset supports only np.ndarray features"
        super().set_data(data, features, roles)
        self._check_dtype()
    @staticmethod
    def _hstack(datasets: Sequence[np.ndarray]) -> np.ndarray:
        """Concatenate function for numpy arrays.

        Args:
            datasets: Sequence of np.ndarray.

        Returns:
            Stacked features array.

        """
        return np.hstack(datasets)

    @staticmethod
    def _get_rows(data: np.ndarray, k: IntIdx) -> np.ndarray:
        """Get rows slice for numpy ndarray.

        Args:
            data: Data.
            k: Sequence of integer indexes.

        Returns:
            Rows slice.

        """
        return data[k]

    @staticmethod
    def _get_cols(data: np.ndarray, k: IntIdx) -> np.ndarray:
        """Get cols slice.

        Args:
            data: Data.
            k: Sequence of integer indexes.

        Returns:
            Cols slice.

        """
        return data[:, k]

    @classmethod
    def _get_2d(cls, data: np.ndarray, k: Tuple[IntIdx, IntIdx]) -> np.ndarray:
        """Get 2d slice.

        Args:
            data: Data.
            k: Tuple of integer sequences.

        Returns:
            2d slice.

        """
        rows, cols = k
        return data[rows, cols]

    @staticmethod
    def _set_col(data: np.ndarray, k: int, val: np.ndarray):
        """Inplace set column.

        Args:
            data: Data.
            k: Index of column.
            val: Values to set.

        """
        data[:, k] = val
    def to_numpy(self) -> "NumpyDataset":
        """Empty method to convert to numpy.

        Returns:
            Same NumpyDataset.

        """
        return self
    def to_csr(self) -> "CSRSparseDataset":
        """Convert to csr.

        Returns:
            Same dataset in CSRSparseDataset format.

        """
        assert all(
            [self.roles[x].name == "Numeric" for x in self.features]
        ), "Only numeric data accepted in sparse dataset"
        data = None if self.data is None else sparse.csr_matrix(self.data)

        roles = self.roles
        features = self.features
        # target and etc ..
        params = dict(((x, self.__dict__[x]) for x in self._array_like_attrs))
        task = self.task

        return CSRSparseDataset(data, features, roles, task, **params)
    def to_pandas(self) -> "PandasDataset":
        """Convert to PandasDataset.

        Returns:
            Same dataset in PandasDataset format.

        """
        # check for empty case
        data = None if self.data is None else DataFrame(self.data, columns=self.features)
        roles = self.roles
        # target and etc ..
        params = dict(
            (
                (x, Series(self.__dict__[x]) if len(self.__dict__[x].shape) == 1 else DataFrame(self.__dict__[x]))
                for x in self._array_like_attrs
            )
        )
        task = self.task

        return PandasDataset(data, roles, task, **params)
    @staticmethod
    def from_dataset(dataset: Dataset) -> "NumpyDataset":
        """Convert an arbitrary dataset to numpy.

        Args:
            dataset: Dataset to convert.

        Returns:
            Numpy dataset.

        """
        return dataset.to_numpy()
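
# Usage sketch (illustrative only, not part of the original module): shows how
# the ``features``/``roles`` specifiers behave and how a NumpyDataset converts
# to the other formats. The 5x3 random data, the target array and
# ``Task("reg")`` are assumptions made for this example.
def _example_numpy_dataset():  # pragma: no cover
    data = np.random.rand(5, 3)
    target = np.random.rand(5)

    # features=None -> names feat_0, feat_1, feat_2;
    # roles=None -> every column gets NumericRole(np.float32)
    ds = NumpyDataset(data, features=None, roles=None, task=Task("reg"), target=target)
    assert ds.features == ["feat_0", "feat_1", "feat_2"]

    # a string prefix and a single role are broadcast over all columns
    ds2 = NumpyDataset(data, features="col", roles=NumericRole(np.float32), task=Task("reg"))
    assert ds2.features == ["col_0", "col_1", "col_2"]

    # conversions keep data, roles and array-like attrs (target etc.)
    pd_ds = ds.to_pandas()
    csr_ds = ds.to_csr()
    assert pd_ds.shape == csr_ds.shape == (5, 3)
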
class CSRSparseDataset(NumpyDataset):
    """Dataset that contains sparse features and np.ndarray targets."""

    _init_checks = ()
    _data_checks = ()
    _concat_checks = ()
    _dataset_type = "CSRSparseDataset"

    @staticmethod
    def _get_cols(data: Any, k: Any):
        """Not implemented."""
        raise NotImplementedError

    @staticmethod
    def _set_col(data: Any, k: Any, val: Any):
        """Not implemented."""
        raise NotImplementedError
    def to_pandas(self) -> Any:
        """Not implemented."""
        raise NotImplementedError
    def to_numpy(self) -> "NumpyDataset":
        """Convert to NumpyDataset.

        Returns:
            NumpyDataset.

        """
        # check for empty
        data = None if self.data is None else self.data.toarray()
        roles = self.roles
        features = self.features
        # target and etc ..
        params = dict(((x, self.__dict__[x]) for x in self._array_like_attrs))
        task = self.task

        return NumpyDataset(data, features, roles, task, **params)
    @property
    def shape(self) -> Tuple[Optional[int], Optional[int]]:
        """Get size of 2d feature matrix.

        Returns:
            Tuple of 2 elements.

        """
        rows, cols = None, None
        try:
            rows, cols = self.data.shape
        except (TypeError, AttributeError):
            # data may be None for an empty dataset - infer rows from array-like attrs
            if len(self._array_like_attrs) > 0:
                rows = len(self.__dict__[self._array_like_attrs[0]])
        return rows, cols

    @staticmethod
    def _hstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse.csr_matrix:
        """Concatenate function for sparse and numpy arrays.

        Args:
            datasets: Sequence of csr_matrix or np.ndarray.

        Returns:
            Sparse matrix.

        """
        return sparse.hstack(datasets, format="csr")
    def __init__(
        self,
        data: Optional[DenseSparseArray],
        features: NpFeatures = (),
        roles: NpRoles = None,
        task: Optional[Task] = None,
        **kwargs: np.ndarray
    ):
        """Create dataset from csr_matrix.

        Args:
            data: csr_matrix of features.
            features: Features names.
            roles: Roles specifier.
            task: Task specifier.
            **kwargs: Named attributes like target, group etc ..

        Note:
            For different types of the ``features`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set names like ``feat_0``, ``feat_1`` ...
                - ``'Prefix'``: automatically set names like ``Prefix_0``, ``Prefix_1`` ...

            For different types of the ``roles`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set ``NumericRole(np.float32)``.
                - ``ColumnRole``: single role for all columns.
                - dict: mapping from feature name to role.

            A hypothetical round-trip sketch follows this class definition.

        """
        self._initialize(task, **kwargs)
        if data is not None:
            self.set_data(data, features, roles)
    def set_data(self, data: DenseSparseArray, features: NpFeatures = (), roles: NpRoles = None):
        """Inplace set data, features, roles for empty dataset.

        Args:
            data: csr_matrix of features.
            features: Features names.
            roles: Roles specifier.

        Note:
            For different types of the ``features`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set names like ``feat_0``, ``feat_1`` ...
                - ``'Prefix'``: automatically set names like ``Prefix_0``, ``Prefix_1`` ...

            For different types of the ``roles`` parameter there is different behavior:

                - list: should be the same length as ``data.shape[1]``.
                - None: automatically set ``NumericRole(np.float32)``.
                - ``ColumnRole``: single role for all columns.
                - dict: mapping from feature name to role.

        """
        assert data is None or type(data) is sparse.csr_matrix, "CSRSparseDataset supports only csr_matrix features"
        LAMLDataset.set_data(self, data, features, roles)
        self._check_dtype()
    @staticmethod
    def from_dataset(dataset: Dataset) -> "CSRSparseDataset":
        """Convert dataset to sparse dataset.

        Args:
            dataset: Dataset to convert.

        Returns:
            Dataset in sparse form.

        """
        return dataset.to_csr()
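
# Round-trip sketch (illustrative only, not part of the original module):
# dense -> sparse -> dense. ``np.eye(5)`` and ``Task("reg")`` are assumptions
# made for this example.
def _example_csr_dataset():  # pragma: no cover
    dense = NumpyDataset(np.eye(5), features=None, roles=None, task=Task("reg"))

    # from_dataset delegates to dense.to_csr()
    sparse_ds = CSRSparseDataset.from_dataset(dense)
    assert type(sparse_ds.data) is sparse.csr_matrix

    # and back: toarray() restores the dense feature matrix
    restored = sparse_ds.to_numpy()
    assert np.array_equal(restored.data, dense.data)
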
class PandasDataset(LAMLDataset):
    """Dataset that contains ``pd.DataFrame`` features and ``pd.Series`` targets.

    Args:
        data: Table with features.
        roles: Roles specifier.
        task: Task specifier.
        **kwargs: Series, array-like attrs target, group etc ..

    Note:
        A hypothetical usage sketch follows this class definition.

    """

    _init_checks = ()
    _data_checks = ()
    _concat_checks = ()
    _dataset_type = "PandasDataset"

    @property
    def features(self) -> List[str]:
        """Get list of features.

        Returns:
            List of features.

        """
        return [] if self.data is None else list(self.data.columns)

    @features.setter
    def features(self, val: None):
        """Ignore setting features.

        Args:
            val: Ignored.

        """
        pass

    def __init__(
        self,
        data: Optional[DataFrame] = None,
        roles: Optional[RolesDict] = None,
        task: Optional[Task] = None,
        **kwargs: Series
    ):
        if roles is None:
            roles = {}

        # parse parameters
        # check if target, group etc .. defined in roles
        for f in roles:
            for k, r in zip(valid_array_attributes, array_attr_roles):
                if roles[f].name == r:
                    kwargs[k] = data[f].reset_index(drop=True)
                    roles[f] = DropRole()

        self._initialize(task, **kwargs)
        if data is not None:
            self.set_data(data, None, roles)

    def _get_cols_idx(self, columns: Union[Sequence[str], str]) -> Union[Sequence[int], int]:
        """Get numeric index of columns by column names.

        Args:
            columns: Sequence of columns or a single column.

        Returns:
            Sequence of int indexes or a single int.

        """
        if isinstance(columns, str):
            idx = self.data.columns.get_loc(columns)
        else:
            idx = self.data.columns.get_indexer(columns)

        return idx
    def set_data(self, data: DataFrame, features: None, roles: RolesDict):
        """Inplace set data, features, roles for empty dataset.

        Args:
            data: Table with features.
            features: `None`, kept only for interface compatibility.
            roles: Dict with roles.

        """
        super().set_data(data, features, roles)
        self._check_dtype()
    def _check_dtype(self):
        """Check if dtype in ``.set_data`` is ok and cast if not."""
        date_columns = []

        self.dtypes = {}
        for f in self.roles:
            if self.roles[f].name == "Datetime":
                date_columns.append(f)
            else:
                self.dtypes[f] = self.roles[f].dtype

        self.data = self.data.astype(self.dtypes)
        # do we need to reset_index ?? If yes - drop for Series attrs too
        # case to check - concat pandas dataset and from numpy to pandas dataset
        # TODO: Think about reset_index here
        self.data.reset_index(drop=True, inplace=True)

        # handle dates types
        for i in date_columns:
            dt_role = self.roles[i]
            if not (self.data.dtypes[i] is np.datetime64):
                self.data[i] = pd.to_datetime(
                    self.data[i],
                    format=dt_role.format,
                    unit=dt_role.unit,
                    origin=dt_role.origin,
                    cache=True,
                )
            self.dtypes[i] = np.datetime64

    @staticmethod
    def _hstack(datasets: Sequence[DataFrame]) -> DataFrame:
        """Define how to concat features arrays.

        Args:
            datasets: Sequence of tables.

        Returns:
            Concatenated table.

        """
        return pd.concat(datasets, axis=1)

    @staticmethod
    def _get_rows(data: DataFrame, k: IntIdx) -> FrameOrSeries:
        """Define how to get rows slice.

        Args:
            data: Table with data.
            k: Sequence of `int` indexes or `int`.

        Returns:
            Sliced rows.

        """
        return data.iloc[k]

    @staticmethod
    def _get_cols(data: DataFrame, k: IntIdx) -> FrameOrSeries:
        """Define how to get cols slice.

        Args:
            data: Table with data.
            k: Sequence of `int` indexes or `int`.

        Returns:
            Sliced cols.

        """
        return data.iloc[:, k]

    @classmethod
    def _get_2d(cls, data: DataFrame, k: Tuple[IntIdx, IntIdx]) -> FrameOrSeries:
        """Define 2d slice of table.

        Args:
            data: Table with data.
            k: Tuple of sequences of `int` indexes.

        Returns:
            2d sliced table.

        """
        rows, cols = k
        return data.iloc[rows, cols]

    @staticmethod
    def _set_col(data: DataFrame, k: int, val: Union[Series, np.ndarray]):
        """Inplace set column value to `pd.DataFrame`.

        Args:
            data: Table with data.
            k: Column index.
            val: Values to set.

        """
        data.iloc[:, k] = val
    def to_numpy(self) -> "NumpyDataset":
        """Convert to :class:`NumpyDataset`.

        Returns:
            Same dataset in :class:`NumpyDataset` format.

        """
        # check for empty
        data = None if self.data is None else self.data.values
        roles = self.roles
        features = self.features
        # target and etc ..
        params = dict(((x, self.__dict__[x].values) for x in self._array_like_attrs))
        task = self.task

        return NumpyDataset(data, features, roles, task, **params)
    def to_pandas(self) -> "PandasDataset":
        """Empty method, return the same object.

        Returns:
            Self.

        """
        return self
    @staticmethod
    def from_dataset(dataset: Dataset) -> "PandasDataset":
        """Convert an arbitrary dataset to pandas dataset.

        Args:
            dataset: Dataset to convert.

        Returns:
            Converted to pandas dataset.

        """
        return dataset.to_pandas()
    def nan_rate(self):
        """Counts overall number of nans in dataset.

        Returns:
            Number of nans.

        """
        return (len(self.data) - self.data.count()).sum()
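
# Usage sketch (illustrative only, not part of the original module): columns
# whose roles match array-like attributes ("Target" here) are moved out of the
# roles dict into dataset attributes, and Datetime roles are parsed by
# ``_check_dtype``. The toy frame is an assumption made for this example;
# ``DatetimeRole``/``TargetRole`` live in ``lightautoml.dataset.roles``.
def _example_pandas_dataset():  # pragma: no cover
    from .roles import DatetimeRole
    from .roles import TargetRole

    df = DataFrame(
        {
            "num": [1.0, 2.0, None],
            "dt": ["2020-01-01", "2020-01-02", "2020-01-03"],
            "y": [0, 1, 0],
        }
    )
    roles = {"num": NumericRole(np.float32), "dt": DatetimeRole(), "y": TargetRole()}

    ds = PandasDataset(df, roles, task=Task("binary"))

    assert isinstance(ds.target, Series)  # "y" was promoted to ds.target
    assert ds.data["dt"].dtype.kind == "M"  # strings parsed to datetime64
    assert ds.nan_rate() == 1  # one NaN in the "num" column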