Source code for autogluon.eda.analysis.explain

import logging
import warnings
from typing import List, Optional

import numpy as np
import pandas as pd
import shap

from autogluon.core.constants import REGRESSION
from autogluon.eda import AnalysisState
from autogluon.eda.analysis.base import AbstractAnalysis

__all__ = ["ShapAnalysis"]

logger = logging.getLogger(__name__)


class _ShapAutoGluonWrapper:
    """Callable adapter exposing an AutoGluon predictor through the interface `shap` expects."""

    def __init__(self, predictor, feature_names, target_class=None):
        self.ag_model = predictor
        self.feature_names = feature_names
        self.target_class = target_class
        if target_class is None and predictor.problem_type != REGRESSION:
            logger.warning("Since target_class is not specified, SHAP will explain predictions for each class")

    def predict_proba(self, X):
        # shap passes raw numpy arrays; rebuild a DataFrame so the predictor sees named columns
        if isinstance(X, pd.Series):
            X = X.values.reshape(1, -1)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names)
        if self.ag_model.problem_type == REGRESSION:
            preds = self.ag_model.predict(X)
        else:
            preds = self.ag_model.predict_proba(X)
        if self.ag_model.problem_type == REGRESSION or self.target_class is None:
            return preds
        else:
            # Keep only the probability column for the class being explained
            return preds[self.target_class]

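# A minimal sketch (not part of the library API) of how the wrapper above plugs into
# shap: the bound method `predict_proba` is handed to `shap.KernelExplainer`, which
# perturbs features against a background sample to attribute the model output.
# `predictor`, `train_data`, and `target_class=1` below are hypothetical.
#
#     wrapper = _ShapAutoGluonWrapper(predictor, train_data.columns, target_class=1)
#     explainer = shap.KernelExplainer(wrapper.predict_proba, train_data.sample(100))
#     shap_values = explainer.shap_values(train_data.head(1), silent=True)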

class ShapAnalysis(AbstractAnalysis):
    """
    Perform Shapley values calculation using `shap` package for the given rows.

    Parameters
    ----------
    rows: pd.DataFrame
        rows to explain
    baseline_sample: int, default = 100
        The background dataset size to use for integrating out features.
        To determine the impact of a feature, that feature is set to "missing"
        and the change in the model output is observed.
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: List[AbstractAnalysis], default = []
        wrapped analyses; these will receive sampled `args` during `fit` call
    state: AnalysisState
        state to be updated by this fit function
    random_state: int, default = 0
        random state for sampling
    kwargs

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> auto.analyze(
    >>>     train_data=..., model=...,
    >>>     anlz_facets=[
    >>>         eda.explain.ShapAnalysis(rows, baseline_sample=200),
    >>>     ],
    >>>     viz_facets=[
    >>>         # Visualize the given SHAP values with an additive force layout
    >>>         viz.explain.ExplainForcePlot(),
    >>>         # Visualize the given SHAP values with a waterfall layout
    >>>         viz.explain.ExplainWaterfallPlot(),
    >>>     ]
    >>> )

    See Also
    --------
    :py:class:`~shap.KernelExplainer`
    :py:class:`~autogluon.eda.visualization.explain.ExplainForcePlot`
    :py:class:`~autogluon.eda.visualization.explain.ExplainWaterfallPlot`
    """

    def __init__(
        self,
        rows: pd.DataFrame,
        baseline_sample: int = 100,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        state: Optional[AnalysisState] = None,
        random_state: int = 0,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, state, **kwargs)
        self.rows = rows
        self.baseline_sample = baseline_sample
        self.random_state = random_state

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        return self.all_keys_must_be_present(args, "model", "train_data")

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        _baseline_sample = min(self.baseline_sample, len(args.train_data))
        baseline = args.train_data.sample(_baseline_sample, random_state=self.random_state)
        shap_data = []
        for _, row in self.rows.iterrows():
            _row = pd.DataFrame([row])
            if args.model.problem_type == REGRESSION:
                predicted_class = None
            else:
                predicted_class = args.model.predict(_row).iloc[0]
            ag_wrapper = _ShapAutoGluonWrapper(args.model, args.train_data.columns, predicted_class)
            explainer = shap.KernelExplainer(ag_wrapper.predict_proba, baseline)
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")  # Suppress sklearn pipeline warnings
                np.int = int  # type: ignore[attr-defined]  # workaround to address shap's use of old numpy APIs
                ke_shap_values = explainer.shap_values(_row[args.train_data.columns], silent=True)
            shap_data.append(
                AnalysisState(
                    row=_row,
                    expected_value=explainer.expected_value,
                    shap_values=ke_shap_values[0],
                    features=row[args.model.original_features],
                    feature_names=None,
                )
            )
        state.explain = {"shapley": shap_data}
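
# Usage sketch, assuming a trained `predictor` and a `train_data` DataFrame (both
# hypothetical): calling the analysis directly instead of via
# `autogluon.eda.auto.analyze` as shown in the docstring example. `fit` forwards
# its keyword arguments to `_fit` as `args` and returns the populated state.
#
#     analysis = ShapAnalysis(rows=train_data.head(3), baseline_sample=100)
#     state = analysis.fit(model=predictor, train_data=train_data)
#     for item in state.explain["shapley"]:
#         print(item.expected_value, item.shap_values)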