Source code for autogluon.eda.analysis.base

from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from typing import Generator, List, Optional, Tuple

from pandas import DataFrame

from ..state import AnalysisState, StateCheckMixin

logger = logging.getLogger(__name__)


class AbstractAnalysis(ABC, StateCheckMixin):
    """
    Base class for a node in a tree of analyses.

    Every node keeps a reference to its parent, the list of its children, a state object and the
    keyword arguments it was constructed with (`self.args`). Calling :meth:`fit` resolves the state,
    assembles the effective arguments from the root down to this node and runs :meth:`_fit` followed
    by the children's `fit`.

    Parameters
    ----------
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default = None
        nested analyses; each of them gets its `parent` set to this instance and its `state` set to this
        instance's state
    state: Optional[AnalysisState], default = None
        state to use; if not provided, it is resolved during `fit` (see :meth:`_get_state_from_parent`)
    kwargs
        arguments of this component; stored as-is in `self.args`
    """

    def __init__(
        self,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        state: Optional[AnalysisState] = None,
        **kwargs,
    ) -> None:
        self.parent = parent
        self.children: List[AbstractAnalysis] = [] if children is None else children
        self.state: Optional[AnalysisState] = state
        # Children are re-parented to this node and share its state object
        for c in self.children:
            c.parent = self
            c.state = self.state
        self.args = kwargs

    def _gather_args(self) -> AnalysisState:
        """
        Assemble the effective arguments for this node.

        Walks from this node up to the root, then merges `args` of every node on that path starting at the root,
        so arguments declared closer to this node shadow the ones declared by its ancestors.

        Returns
        -------
        merged arguments
        """
        # Collect self -> root with O(1) appends; merge in reverse (root -> self) below
        lineage: List[AbstractAnalysis] = []
        node: Optional[AbstractAnalysis] = self
        while node is not None:
            lineage.append(node)
            node = node.parent

        args = AnalysisState()
        for node in reversed(lineage):
            args = AnalysisState({**args, **node.args})
        return args

    @staticmethod
    def available_datasets(args: AnalysisState) -> Generator[Tuple[str, DataFrame], None, None]:
        """
        Generator which iterates only through the datasets provided in arguments

        Parameters
        ----------
        args: AnalysisState
            arguments passed into the call. These are different from `self.args` in a way that it's arguments assembled
            from the parents and shadowed via children (allows to isolate reused parameters in upper arguments
            declarations).

        Returns
        -------
        tuple of dataset name (train_data, test_data, tuning_data or val_data) and dataset itself
        """
        for ds in ["train_data", "test_data", "tuning_data", "val_data"]:
            # Skip datasets which are absent or explicitly set to None
            if ds in args and args[ds] is not None:
                df: DataFrame = args[ds]
                yield ds, df

    def _get_state_from_parent(self) -> AnalysisState:
        """
        Resolve the state this node should work with.

        Returns
        -------
        own state if it is set; otherwise the parent's state, or a new empty `AnalysisState` when there is no parent
        """
        state = self.state
        if state is None:
            if self.parent is None:
                state = AnalysisState()
            else:
                state = self.parent.state
        return state  # type: ignore

    @abstractmethod
    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """
        Checks if state and args has all the required parameters for fitting.
        See also :func:`at_least_one_key_must_be_present` and :func:`all_keys_must_be_present` helpers
        to construct more complex logic.

        Parameters
        ----------
        state: AnalysisState
            state to be updated by this fit function
        args: AnalysisState
            analysis properties assembled from root of analysis hierarchy to this component (with lower levels
            shadowing upper level args).

        Returns
        -------
        `True` if all the pre-requisites for fitting are present
        """
        raise NotImplementedError

    @abstractmethod
    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """
        @override
        Component-specific internal processing.
        This method is designed to be overridden by the component developer

        Parameters
        ----------
        state: AnalysisState
            state to be updated by this fit function
        args: AnalysisState
            analysis properties assembled from root of analysis hierarchy to this component (with lower levels
            shadowing upper level args).
        fit_kwargs
            arguments passed into fit call
        """
        raise NotImplementedError

    def fit(self, **kwargs) -> AnalysisState:
        """
        Fit the analysis tree.

        Parameters
        ----------
        kwargs
            fit arguments

        Returns
        -------
        state produced by fit

        Raises
        ------
        AssertionError
            if this is an inner analysis and neither it nor its parent has a state
        """
        self.state = self._get_state_from_parent()
        # Raised explicitly (instead of via `assert`) so the check is not stripped under `python -O`;
        # the exception type is kept as AssertionError for backward compatibility.
        if self.parent is not None and self.state is None:
            raise AssertionError(
                "Inner analysis fit() is called while parent has no state. Please call top-level analysis fit instead"
            )
        _args = self._gather_args()
        if self.can_handle(self.state, _args):
            self._fit(self.state, _args, **kwargs)
            # NOTE(review): children are fitted only when this node can handle the args - confirm this nesting
            for c in self.children:
                c.fit(**kwargs)
        return self.state
class BaseAnalysis(AbstractAnalysis):
    """
    Analysis node without any processing of its own.

    It reports that it can handle any state/args and its `_fit` does nothing, so it can serve as a plain
    container for `children`.

    Parameters
    ----------
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default = None
        wrapped analyses
    kwargs
        arguments forwarded to the base class constructor
    """

    def __init__(
        self,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent=parent, children=children, **kwargs)

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """Always `True`: this component has no pre-requisites."""
        return True

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """No-op: this component does not update the state."""
        return None
class Namespace(AbstractAnalysis):
    """
    Creates a nested namespace in state. All the components within `children` will have relative root of the state
    moved into this subspace. To instruct visualization facets to use a specific subspace, please use `namespace`
    argument (see the example).

    Parameters
    ----------
    namespace: Optional[str], default = None
        namespace to use; use root if not specified
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses; these will use the namespace sub-state as their state during `fit` call
    kwargs

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> auto.analyze(
    >>>     train_data=..., label=...,
    >>>     anlz_facets=[
    >>>         # Puts output into the root namespace
    >>>         eda.interaction.Correlation(),
    >>>         # Puts output into the focus namespace
    >>>         eda.Namespace(namespace='focus', children=[
    >>>             eda.interaction.Correlation(focus_field='Fare', focus_field_threshold=0.3),
    >>>         ])
    >>>     ],
    >>>     viz_facets=[
    >>>         # Renders correlations from the root namespace
    >>>         viz.interaction.CorrelationVisualization(),
    >>>         # Renders correlations from the focus namespace
    >>>         viz.interaction.CorrelationVisualization(namespace='focus'),
    >>>     ]
    >>> )

    """

    def __init__(
        self,
        namespace: Optional[str] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)
        self.namespace = namespace

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        """Always `True`: a namespace has no pre-requisites of its own."""
        return True

    def fit(self, **kwargs) -> AnalysisState:
        """
        Fit the wrapped analyses inside the namespace.

        Parameters
        ----------
        kwargs
            fit arguments

        Returns
        -------
        state of this namespace

        Raises
        ------
        AssertionError
            if the namespace has no parent analysis
        """
        # Raised explicitly (instead of via `assert`) so the check is not stripped under `python -O`;
        # the exception type is kept as AssertionError for backward compatibility.
        if self.parent is None:
            raise AssertionError(
                "Namespace must be wrapped into other analysis. You can use BaseAnalysis of one is needed"
            )
        return super().fit(**kwargs)

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        """No-op: the namespace itself does not update the state."""
        pass

    def _get_state_from_parent(self) -> AnalysisState:
        """
        Resolve the state for this namespace.

        Returns
        -------
        the state resolved by the base class when `namespace` is `None` (root); otherwise the sub-state stored
        under the `namespace` key, which is created empty if missing
        """
        state = super()._get_state_from_parent()
        # No namespace requested: stay at the root instead of creating a sub-state under the `None` key
        if self.namespace is None:
            return state
        # NOTE(review): `fit` overwrites `self.state` with the sub-state returned here, so a repeated `fit`
        # appears to resolve the namespace inside itself - confirm whether re-fitting is expected.
        if self.namespace not in state:
            state[self.namespace] = {}
        return state[self.namespace]