Source code for autogluon.eda.analysis.model

from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

from autogluon.core.constants import (
    BINARY,
    MULTICLASS,
    PROBLEM_TYPES_CLASSIFICATION,
    PROBLEM_TYPES_REGRESSION,
    REGRESSION,
)
from autogluon.tabular import TabularPredictor

from .base import AbstractAnalysis, AnalysisState

__all__ = ["AutoGluonModelEvaluator", "AutoGluonModelQuickFit"]


[docs]class AutoGluonModelQuickFit(AbstractAnalysis):
    """
    Fit a quick model using AutoGluon.

    `train_data`, `val_data` and `label` must be present in args.

    Note: this component can be wrapped into :py:class:`~autogluon.eda.analysis.dataset.TrainValidationSplit` and `~autogluon.eda.analysis.dataset.Sampler`
    to perform automated sampling and train-test split. This whole logic is implemented in :py:meth:`~autogluon.eda.auto.simple.quick_fit` shortcut.

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>>
    >>> # Quick fit
    >>> state = auto.quick_fit(
    >>>     train_data=..., label=...,
    >>>     return_state=True,  # return state object from call
    >>>     hyperparameters={'GBM': {}}  # train specific model
    >>> )
    >>>
    >>> # Using quick fit model
    >>> model = state.model
    >>> y_pred = model.predict(test_data)

    Parameters
    ----------
    problem_type: str, default = 'auto'
        problem type to use. Valid problem_type values include ['auto', 'binary', 'multiclass', 'regression', 'quantile', 'softclass']
        auto means it will be Auto-detected using AutoGluon methods.
    estimator_args: Optional[Dict[str, Any]], default = None,
        kwargs to pass into estimator constructor (`TabularPredictor`)
    save_model_to_state: bool, default = True,
        save fitted model into `state` under `model` key.
        This functionality might be helpful in cases when the fitted model could be usable for other purposes (i.e. imputers)
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses; these will receive sampled `args` during `fit` call
    kwargs

    See Also
    --------
    :py:meth:`~autogluon.eda.auto.simple.quick_fit`
    :py:class:`~autogluon.eda.analysis.dataset.TrainValidationSplit`
    :py:class:`~autogluon.eda.analysis.dataset.Sampler`

    """

    def __init__(
        self,
        problem_type: str = "auto",
        estimator_args: Optional[Dict[str, Any]] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        save_model_to_state: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)

        valid_problem_types = ["auto"] + PROBLEM_TYPES_REGRESSION + PROBLEM_TYPES_CLASSIFICATION
        assert problem_type in valid_problem_types, f"Valid problem_type values include {valid_problem_types}"
        self.problem_type: Optional[str] = None if problem_type == "auto" else problem_type

        self.save_model_to_state = save_model_to_state

        if estimator_args is not None:
            self.estimator_args = estimator_args
        else:
            self.estimator_args = {}

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        return self.all_keys_must_be_present(args, "train_data", "val_data", "label")

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        estimator: TabularPredictor = TabularPredictor(
            label=args.label, problem_type=self.problem_type, **self.estimator_args
        )
        estimator.fit(train_data=args.train_data, **self.args)
        self.args["model"] = estimator

        if self.save_model_to_state:
            state["model"] = estimator


[docs]class AutoGluonModelEvaluator(AbstractAnalysis):
    """
    Evaluate AutoGluon model performance.

    This analysis requires a trained classifier passed in `model` arg and uses 'val_data' dataset to assess model performance.

    It is assumed that the validation dataset should follow the same column names seen by the model and has not been used during the training process.

    Parameters
    ----------
    model: TabularPredictor, required
        fitted AutoGluon model to analyze
    val_data: pd.DataFrame, required
        validation dataset to use.
        Warning: do not use data used for training as a validation data.
        Predictions on data used by the model during training tend to be optimistic and might not generalize on unseen data.
    normalize : {'true', 'pred', 'all'}, default=None
        Normalizes confusion matrix over the true (rows), predicted (columns)
        conditions or all the population. If None, confusion matrix will not be
        normalized.
        Note: applicable only for binary and multiclass classification; ignored for regression models.
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default None
        wrapped analyses; these will receive sampled `args` during `fit` call

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> df_train = ...
    >>> df_test = ...
    >>> predictor = ...
    >>>
    >>> auto.analyze(model=predictor, val_data=df_test, anlz_facets=[
    >>>     eda.model.AutoGluonModelEvaluator(),
    >>> ], viz_facets=[
    >>>     viz.layouts.MarkdownSectionComponent(markdown=f'### Model Prediction for {predictor.label}'),
    >>>     viz.model.ConfusionMatrix(fig_args=dict(figsize=(3,3)), annot_kws={"size": 12}),
    >>>     viz.model.RegressionEvaluation(fig_args=dict(figsize=(6,6)), chart_args=dict(marker='o', scatter_kws={'s':5})),
    >>>     viz.layouts.MarkdownSectionComponent(markdown=f'### Feature Importance for Trained Model'),
    >>>     viz.model.FeatureImportance(show_barplots=True)
    >>> ])
    """

    def __init__(
        self,
        normalize: Union[None, str] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)
        self.normalize = normalize

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        keys_present = self.all_keys_must_be_present(args, "model", "val_data")
        data_cols = sorted(args.val_data.columns.values)
        model_cols = sorted(args.model.original_features + [args.model.label])
        columns_the_same = data_cols == model_cols if keys_present else False
        if not columns_the_same:
            self.logger.warning(
                f"val_data columns {data_cols} are not matching original features model was trained on: {model_cols}"
            )

        return keys_present and columns_the_same

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs):
        predictor: TabularPredictor = args.model
        val_data = args.val_data
        problem_type = predictor.problem_type
        label = predictor.label
        y_true_val, y_pred_val, highest_error, undecided = self._predict(problem_type, predictor, val_data)
        test_data = args.test_data
        test_data_present = args.test_data is not None and label in args.test_data.columns

        y_true_test = None
        y_pred_test = None
        if test_data_present:
            test_data = args.test_data
            y_true_test, y_pred_test, highest_error, undecided = self._predict(problem_type, predictor, test_data)

        _data = test_data if test_data_present else val_data
        importance = predictor.feature_importance(_data.reset_index(drop=True), silent=True)
        leaderboard = predictor.leaderboard(_data, silent=True)

        labels = predictor.class_labels
        s = {
            "problem_type": predictor.problem_type,
            "importance": importance,
            "leaderboard": leaderboard,
            "labels": labels,
            "y_true_val": y_true_val,
            "y_pred_val": y_pred_val,
        }

        try:
            y_pred_train = args.model.get_oof_pred_proba()
            s["y_true_train"] = args.train_data[args.label]
            s["y_pred_train"] = y_pred_train
        except AssertionError:
            # OOF is not available - don't use it
            pass

        if test_data_present:
            s["y_true_test"] = y_true_test
            s["y_pred_test"] = y_pred_test

        if undecided is not None:
            s["undecided"] = undecided
        if highest_error is not None:
            s["highest_error"] = highest_error

        if problem_type in [BINARY, MULTICLASS]:
            cm = confusion_matrix(y_true_val, y_pred_val, normalize=self.normalize, labels=labels)
            s["confusion_matrix_normalized"] = self.normalize is not None
            s["confusion_matrix"] = cm

        state.model_evaluation = s

    def _predict(self, problem_type, predictor, val_data):
        label = predictor.label
        y_true_val = val_data[label]
        y_pred_val = predictor.predict(val_data)
        highest_error = None
        undecided = None
        if predictor.problem_type in [BINARY, MULTICLASS]:
            y_proba = predictor.predict_proba(val_data)

            misclassified = y_proba[y_true_val != y_pred_val]
            expected_value = misclassified.join(y_true_val).apply(lambda row: row.loc[row[label]], axis=1)
            predicted_value = misclassified.max(axis=1)
            highest_error = predicted_value - expected_value
            highest_error.name = "error"

            scores = np.sort(misclassified.values, axis=1)
            diff = scores[:, -1] - expected_value
            undecided = pd.Series(index=y_pred_val.index, data=diff, name="error").sort_values(ascending=True)
            undecided = val_data.join(y_proba).join(undecided).sort_values(by="error")
            highest_error = (
                val_data.join(y_proba, rsuffix="_pred")
                .join(highest_error, how="inner")
                .sort_values(by="error", ascending=False)
            )
        elif problem_type == REGRESSION:
            highest_error = np.abs(y_pred_val - y_true_val).sort_values(ascending=False)
            highest_error.name = "error"
            highest_error = (
                val_data.join(y_pred_val, rsuffix="_pred")
                .join(highest_error, how="inner")
                .sort_values(by="error", ascending=False)
            )
        return y_true_val, y_pred_val, highest_error, undecided