Source code for lightautoml.pipelines.utils

"""Pipeline utils."""

from typing import Any
from typing import List
from typing import Optional
from typing import Sequence

from lightautoml.dataset.base import LAMLDataset


[docs]def map_pipeline_names(input_names: Sequence[str], output_names: Sequence[str]) -> List[Optional[str]]: """Pipelines create name in the way 'prefix__feature_name'. Multiple pipelines will create names in the way 'prefix1__prefix2__feature_name'. This function maps initial features names to outputs. Result may be not exact in some rare cases, but it's ok for real pipelines. Args: input_names: Initial feature names. output_names: Output feature names. Returns: Mapping between feature names. """ # TODO: Add assert here mapped: List[Optional[str]] = [None] * len(output_names) s_in = set(input_names) for n, name in enumerate(output_names): splitted = name.split("__") for i in range(len(splitted)): name = "__".join(splitted[i:]) if name in s_in: mapped[n] = name break assert None not in mapped, "Can not infer names. For feature selection purposes use simple pipeline (one-to-one)" return mapped
[docs]def get_columns_by_role(dataset: LAMLDataset, role_name: str, **kwargs: Any) -> List[str]: """Search for columns with specific role and attributes when building pipeline. Args: dataset: Dataset to search. role_name: Name of features role. **kwargs: Specific parameters values to search. Example: search for categories with OHE processing only. Returns: List of str features names. """ features = [] inv_roles = dataset.inverse_roles for role in inv_roles: if role.name == role_name: flg = True # TODO: maybe refactor for k in kwargs: try: attr = getattr(role, k) except AttributeError: flg = False break if attr != kwargs[k]: flg = False break if flg: features.extend(inv_roles[role]) return sorted(features)