Source code for autogluon.eda.analysis.base

from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from typing import Dict, Generator, List, Optional, Tuple

from pandas import DataFrame

from ..state import AnalysisState, StateCheckMixin

logger = logging.getLogger(__name__)


class AbstractAnalysis(ABC, StateCheckMixin):
    """
    Base class for analysis functionality.

    Provides state/args management for an analysis hierarchy. Analyses can be nested; the hierarchical
    relationships can be navigated via the `parent` and `children` attributes.

    The main entry method of an analysis is `fit`. The `_fit` method is designed to be overridden by the
    component developer and should encapsulate all the outputs into the `state` object provided.
    When `fit` is called, the execution flow is the following:

    - resolve the state to write into: the component's own state if set, otherwise the parent's state,
      otherwise (for a root component) a new `AnalysisState`;
    - gather `args` from the parent levels of the analysis hierarchy; this is done to avoid repeating the
      same args on each nested component (i.e. `train_data` can be specified at the top and all the children
      will be able to access it via `args` on all levels, unless overridden by one of the components in the
      hierarchy);
    - call `_fit` for each component that returned `True` from its `can_handle` call.

    Please note: `state` is shared across the whole analysis hierarchy. If two components change the same
    space, then it will be overridden by each consecutive update. If the same component has to be reused and
    requires writing different outputs, please use :py:class:`~autogluon.eda.analysis.base.Namespace`
    wrapper to isolate the components.

    Parameters
    ----------
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses; each of them gets its `parent` set to this instance and its `state` set to this
        instance's state at construction time. The list object itself is stored, not copied.
    state: Optional[AnalysisState], default = None
        state object to use; if not provided, it is resolved during the `fit` call
        (parent's state, or a new state for a root component)
    kwargs
        arguments to pass into the component; stored as `self.args`

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.base.Namespace`

    """

    def __init__(
        self,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        state: Optional[AnalysisState] = None,
        **kwargs,
    ) -> None:
        self.parent = parent
        self.children: List[AbstractAnalysis] = [] if children is None else children
        self.state: Optional[AnalysisState] = state
        # Wire the hierarchy: every child points back to this node and shares its state reference.
        for child in self.children:
            child.parent = self
            child.state = self.state
        self.args = kwargs

    def _gather_args(self) -> AnalysisState:
        """
        Assemble the effective args for this component.

        Args are merged from the root of the hierarchy down to this component, so values declared closer
        to this component shadow the ones declared on its ancestors.

        Returns
        -------
            merged args
        """
        # Walk up once (leaf -> root) instead of repeatedly inserting at the head of a list.
        lineage: List[AbstractAnalysis] = []
        current: Optional[AbstractAnalysis] = self
        while current is not None:
            lineage.append(current)
            current = current.parent

        args = AnalysisState()
        # Merge root first so that later (deeper) levels override earlier ones.
        for level in reversed(lineage):
            args = AnalysisState({**args, **level.args})
        return args

    @staticmethod
    def available_datasets(args: AnalysisState) -> Generator[Tuple[str, DataFrame], None, None]:
        """
        Generator which iterates only through the datasets provided in arguments.

        Parameters
        ----------
        args: AnalysisState
            arguments passed into the call. These are different from `self.args` in a way that these are
            arguments assembled from the parents and shadowed via children (this allows to isolate reused
            parameters in upper arguments declarations).

        Returns
        -------
            tuple of dataset name (`train_data`, `test_data`, `tuning_data` or `val_data`) and dataset itself;
            datasets which are absent from `args` or set to `None` are skipped

        """
        for ds in ("train_data", "test_data", "tuning_data", "val_data"):
            if ds in args and args[ds] is not None:
                df: DataFrame = args[ds]
                yield ds, df

    def _get_state_from_parent(self) -> AnalysisState:
        """
        Resolve the state this component should write into.

        Returns
        -------
            own state if it is set; otherwise a new `AnalysisState` for a component without a parent,
            or the parent's state (which may itself be unset) for a nested component
        """
        if self.state is not None:
            return self.state
        if self.parent is None:
            return AnalysisState()
        return self.parent.state  # type: ignore

    @abstractmethod
    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """
        Checks if state and args have all the required parameters for fitting.
        See also :func:`at_least_one_key_must_be_present` and :func:`all_keys_must_be_present` helpers
        to construct more complex logic.

        Parameters
        ----------
        state: AnalysisState
            state to be updated by this fit function
        args: AnalysisState
            analysis properties assembled from root of analysis hierarchy to this component
            (with lower levels shadowing upper level args).

        Returns
        -------
            `True` if all the pre-requisites for fitting are present

        """
        raise NotImplementedError

    @abstractmethod
    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """
        @override
        Component-specific internal processing.
        This method is designed to be overridden by the component developer.

        Parameters
        ----------
        state: AnalysisState
            state to be updated by this fit function
        args: AnalysisState
            analysis properties assembled from root of analysis hierarchy to this component
            (with lower levels shadowing upper level args).
        fit_kwargs
            arguments passed into fit call
        """
        raise NotImplementedError

    def fit(self, **kwargs) -> AnalysisState:
        """
        Fit the analysis tree.

        Parameters
        ----------
        kwargs
            fit arguments; passed to `_fit` of this component and to `fit` of every child

        Returns
        -------
            state produced by fit

        Raises
        ------
        AssertionError
            if this component has a parent but no state could be resolved for it

        """
        self.state = self._get_state_from_parent()
        if self.parent is not None and self.state is None:
            # Raised explicitly (rather than via `assert`) so the check is not stripped under `python -O`;
            # the exception type is kept for backward compatibility.
            raise AssertionError(
                "Inner analysis fit() is called while parent has no state. Please call top-level analysis fit instead"
            )
        _args = self._gather_args()
        if self.can_handle(self.state, _args):
            self._fit(self.state, _args, **kwargs)
            # NOTE(review): the flattened listing this block was recovered from carries no indentation;
            # children are assumed to be fitted only when this component can handle the args - confirm.
            for child in self.children:
                child.fit(**kwargs)
        return self.state
class BaseAnalysis(AbstractAnalysis):
    """
    Simple implementation of :py:class:`~autogluon.eda.analysis.base.AbstractAnalysis`.

    It accepts any state/args combination and performs no processing of its own, which makes it
    usable as a plain container for `children` and shared arguments.

    Parameters
    ----------
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses
    kwargs
        arguments forwarded to the base class constructor

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.base.AbstractAnalysis`

    """

    def __init__(
        self,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent=parent, children=children, **kwargs)

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """Always report that fitting is possible; this component has no pre-requisites."""
        return True

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """Intentionally a no-op: this component writes nothing into `state`."""
        return None
class Namespace(AbstractAnalysis):
    """
    Creates a nested namespace in state. All the components within `children` will have relative root of
    the state moved into this subspace. To instruct visualization facets to use a specific subspace,
    please use `namespace` argument (see the example).

    Parameters
    ----------
    namespace: Optional[str], default = None
        namespace to use; use root if not specified
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses; these write their outputs into the namespace's subspace
    kwargs
        arguments forwarded to the base class constructor

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> auto.analyze(
    >>>     train_data=..., label=...,
    >>>     anlz_facets=[
    >>>         # Puts output into the root namespace
    >>>         eda.interaction.Correlation(),
    >>>         # Puts output into the focus namespace
    >>>         eda.Namespace(namespace='focus', children=[
    >>>             eda.interaction.Correlation(focus_field='Fare', focus_field_threshold=0.3),
    >>>         ])
    >>>     ],
    >>>     viz_facets=[
    >>>         # Renders correlations from the root namespace
    >>>         viz.interaction.CorrelationVisualization(),
    >>>         # Renders correlations from the focus namespace
    >>>         viz.interaction.CorrelationVisualization(namespace='focus'),
    >>>     ]
    >>> )

    """

    def __init__(
        self,
        namespace: Optional[str] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)
        self.namespace = namespace

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """Always report that fitting is possible; a namespace has no pre-requisites."""
        return True

    def fit(self, **kwargs) -> AnalysisState:
        """
        Fit the wrapped analyses inside the namespace.

        Parameters
        ----------
        kwargs
            fit arguments

        Returns
        -------
            state produced by fit (the namespace's subspace)

        Raises
        ------
        AssertionError
            if the namespace is not nested into another analysis
        """
        if self.parent is None:
            # Raised explicitly (rather than via `assert`) so the check is not stripped under `python -O`;
            # the exception type is kept for backward compatibility.
            raise AssertionError(
                "Namespace must be wrapped into other analysis. You can use BaseAnalysis of one is needed"
            )
        return super().fit(**kwargs)

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """Intentionally a no-op: the namespace only redirects where its children write."""
        pass

    def _get_state_from_parent(self) -> AnalysisState:
        """
        Resolve the state for this namespace.

        Returns
        -------
            the state resolved by the base class when no namespace is specified (root is used) or when no
            state could be resolved at all; otherwise the subspace stored under the `namespace` key,
            which is created as an empty mapping on first use
        """
        state = super()._get_state_from_parent()
        if state is None or self.namespace is None:
            # - No namespace requested: outputs go to the root, as documented for the `namespace` parameter.
            # - No state resolved: hand `None` back so the base class `fit` can report the missing state
            #   with its descriptive error instead of failing on the membership test below.
            return state  # type: ignore
        # NOTE(review): the base class `fit` stores the returned subspace in `self.state`, so a repeated
        # `fit` on the same instance appears to nest the namespace one level deeper - confirm intent.
        if self.namespace not in state:
            state[self.namespace] = {}
        return state[self.namespace]
class SaveArgsToState(AbstractAnalysis):
    """
    Helper which copies selected values from args into state.

    Can be helpful if some other transformation was performed on the args (i.e. by a feature generator)
    and the modified args have to be stored for other purposes.

    Parameters
    ----------
    params_mapping: Dict[str, str]
        mapping between args keys and state keys to be copied.
        I.e. `{'a': 'b'}` means `state.b = args.a`
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses
    state: Optional[AnalysisState], default = None
        state to be updated by this fit function
    kwargs
        arguments forwarded to the base class constructor

    """

    def __init__(
        self,
        params_mapping: Dict[str, str],
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        state: Optional[AnalysisState] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, state, **kwargs)
        self.params_mapping = params_mapping

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """Delegate to the inherited `all_keys_must_be_present` helper with every source key of the mapping."""
        required_keys = list(self.params_mapping)
        return self.all_keys_must_be_present(args, *required_keys)

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """Copy `args[source]` into `state[target]` for every `source -> target` pair of the mapping."""
        for source_key in self.params_mapping:
            target_key = self.params_mapping[source_key]
            state[target_key] = args[source_key]