# Source code for autogluon.eda.analysis.model

from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

from autogluon.core.constants import (
    BINARY,
    MULTICLASS,
    PROBLEM_TYPES_CLASSIFICATION,
    PROBLEM_TYPES_REGRESSION,
    REGRESSION,
)
from autogluon.tabular import TabularPredictor

from .base import AbstractAnalysis, AnalysisState

__all__ = ["AutoGluonModelEvaluator", "AutoGluonModelQuickFit"]


class AutoGluonModelQuickFit(AbstractAnalysis):
    """
    Fit a quick model using AutoGluon.

    `train_data`, `val_data` and `label` must be present in `args`.

    Note: this component can be wrapped into
    :py:class:`~autogluon.eda.analysis.dataset.TrainValidationSplit` and
    :py:class:`~autogluon.eda.analysis.dataset.Sampler` to perform automated sampling and
    train-test split. This whole logic is implemented in the
    :py:meth:`~autogluon.eda.auto.simple.quick_fit` shortcut.

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.auto as auto
    >>>
    >>> # Quick fit
    >>> state = auto.quick_fit(
    >>>     train_data=..., label=...,
    >>>     return_state=True,  # return state object from call
    >>>     save_model_to_state=True,  # store fitted model into the state
    >>>     hyperparameters={'GBM': {}}  # train specific model
    >>> )
    >>>
    >>> # Using quick fit model
    >>> model = state.model
    >>> y_pred = model.predict(test_data)

    Parameters
    ----------
    problem_type: str, default = 'auto'
        problem type to use. Valid problem_type values include ['auto', 'binary', 'multiclass', 'regression',
        'quantile', 'softclass']; 'auto' means the problem type will be auto-detected using AutoGluon methods.
    estimator_args: Optional[Dict[str, Any]], default = None
        kwargs to pass into the estimator constructor (`TabularPredictor`)
    save_model_to_state: bool, default = False
        save the fitted model into `state` under the `model` key.
        This can be helpful when the fitted model is usable for other purposes (i.e. imputers)
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default = None
        wrapped analyses; these will receive sampled `args` during the `fit` call
    kwargs

    See Also
    --------
    :py:meth:`~autogluon.eda.auto.simple.quick_fit`
    :py:class:`~autogluon.eda.analysis.dataset.TrainValidationSplit`
    :py:class:`~autogluon.eda.analysis.dataset.Sampler`
    """

    def __init__(
        self,
        problem_type: str = "auto",
        estimator_args: Optional[Dict[str, Any]] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        save_model_to_state: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)
        valid_problem_types = ["auto"] + PROBLEM_TYPES_REGRESSION + PROBLEM_TYPES_CLASSIFICATION
        assert problem_type in valid_problem_types, f"Valid problem_type values include {valid_problem_types}"
        self.problem_type: Optional[str] = None if problem_type == "auto" else problem_type
        self.save_model_to_state = save_model_to_state
        self.estimator_args = estimator_args if estimator_args is not None else {}

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        return self.all_keys_must_be_present(args, "train_data", "val_data", "label")

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
        estimator: TabularPredictor = TabularPredictor(
            label=args.label, problem_type=self.problem_type, **self.estimator_args
        )
        # Remaining constructor kwargs (e.g. hyperparameters) are forwarded to fit
        estimator.fit(train_data=args.train_data, **self.args)
        self.args["model"] = estimator
        if self.save_model_to_state:
            state["model"] = estimator
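
# A minimal usage sketch (not part of this module) of composing AutoGluonModelQuickFit
# with the Sampler and TrainValidationSplit wrappers mentioned in the docstring note,
# approximating what `auto.simple.quick_fit` does internally. The `sample` and
# `val_size` constructor arguments shown here are assumptions about those wrappers,
# not verified API; `df_train` is assumed to be defined.
#
#     import autogluon.eda.analysis as eda
#     import autogluon.eda.auto as auto
#
#     state = auto.analyze(
#         train_data=df_train,
#         label="target",
#         return_state=True,
#         anlz_facets=[
#             eda.dataset.Sampler(sample=10_000, children=[
#                 eda.dataset.TrainValidationSplit(val_size=0.3, children=[
#                     eda.model.AutoGluonModelQuickFit(
#                         save_model_to_state=True,
#                         hyperparameters={"GBM": {}},  # forwarded to TabularPredictor.fit
#                     ),
#                 ]),
#             ]),
#         ],
#     )
#     model = state.model  # available because save_model_to_state=True
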
class AutoGluonModelEvaluator(AbstractAnalysis):
    """
    Evaluate AutoGluon model performance.

    This analysis requires a trained predictor passed in the `model` arg and uses the `val_data`
    dataset to assess model performance. The validation dataset must have the same columns the
    model was trained on and must not have been used during the training process.

    Parameters
    ----------
    model: TabularPredictor, required
        fitted AutoGluon model to analyze
    val_data: pd.DataFrame, required
        validation dataset to use.
        Warning: do not use data that was used for training as validation data.
        Predictions on data seen by the model during training tend to be optimistic
        and might not generalize to unseen data.
    normalize: {'true', 'pred', 'all'}, default = None
        normalizes the confusion matrix over the true (rows) or predicted (columns) conditions,
        or over the whole population. If None, the confusion matrix will not be normalized.
        Note: applicable only for binary and multiclass classification; ignored for regression models.
    parent: Optional[AbstractAnalysis], default = None
        parent Analysis
    children: Optional[List[AbstractAnalysis]], default = None
        wrapped analyses; these will receive sampled `args` during the `fit` call

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> df_train = ...
    >>> df_test = ...
    >>> predictor = ...
    >>>
    >>> auto.analyze(model=predictor, val_data=df_test, anlz_facets=[
    >>>     eda.model.AutoGluonModelEvaluator(),
    >>> ], viz_facets=[
    >>>     viz.layouts.MarkdownSectionComponent(markdown=f'### Model Prediction for {predictor.label}'),
    >>>     viz.model.ConfusionMatrix(fig_args=dict(figsize=(3, 3)), annot_kws={"size": 12}),
    >>>     viz.model.RegressionEvaluation(fig_args=dict(figsize=(6, 6)), chart_args=dict(marker='o', scatter_kws={'s': 5})),
    >>>     viz.layouts.MarkdownSectionComponent(markdown='### Feature Importance for Trained Model'),
    >>>     viz.model.FeatureImportance(show_barplots=True),
    >>> ])
    """

    def __init__(
        self,
        normalize: Union[None, str] = None,
        parent: Optional[AbstractAnalysis] = None,
        children: Optional[List[AbstractAnalysis]] = None,
        **kwargs,
    ) -> None:
        super().__init__(parent, children, **kwargs)
        self.normalize = normalize

    def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
        keys_present = self.all_keys_must_be_present(args, "model", "val_data")
        data_cols = sorted(args.val_data.columns.values)
        model_cols = sorted(args.model.original_features + [args.model.label])
        columns_the_same = data_cols == model_cols if keys_present else False
        if not columns_the_same:
            self.logger.warning(
                f"val_data columns {data_cols} do not match the original features the model was trained on: {model_cols}"
            )
        return keys_present and columns_the_same

    def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs):
        predictor: TabularPredictor = args.model
        val_data = args.val_data
        problem_type = predictor.problem_type
        label = predictor.label
        y_true = val_data[label]
        y_pred = predictor.predict(val_data)

        highest_error = None
        undecided = None
        if problem_type in [BINARY, MULTICLASS]:
            y_proba = predictor.predict_proba(val_data)
            # Most confident wrong predictions: highest predicted probability among misclassified rows
            highest_error = y_proba[y_true != y_pred].max(axis=1)
            highest_error.name = "error"
            # Undecided predictions: smallest gap between the top-2 class scores
            scores = np.sort(y_proba.values, axis=1)
            diff = scores[:, -1] - scores[:, -2]
            undecided = pd.Series(index=y_pred.index, data=diff, name="score_diff").sort_values(ascending=True)
            undecided = val_data.join(y_proba).join(undecided).sort_values(by="score_diff")
            highest_error = (
                val_data.join(y_proba, rsuffix="_pred")
                .join(highest_error, how="inner")
                .sort_values(by="error", ascending=False)
            )
        elif problem_type == REGRESSION:
            highest_error = np.abs(y_pred - y_true).sort_values(ascending=False)
            highest_error.name = "error"
            highest_error = (
                val_data.join(y_pred, rsuffix="_pred")
                .join(highest_error, how="inner")
                .sort_values(by="error", ascending=False)
            )

        importance = predictor.feature_importance(val_data.reset_index(drop=True), silent=True)
        leaderboard = predictor.leaderboard(val_data, silent=True)
        labels = predictor.class_labels

        s = {
            "problem_type": problem_type,
            "y_true": y_true,
            "y_pred": y_pred,
            "importance": importance,
            "leaderboard": leaderboard,
            "labels": labels,
        }
        if undecided is not None:
            s["undecided"] = undecided
        if highest_error is not None:
            s["highest_error"] = highest_error
        if problem_type in [BINARY, MULTICLASS]:
            cm = confusion_matrix(y_true, y_pred, normalize=self.normalize, labels=labels)
            s["confusion_matrix_normalized"] = self.normalize is not None
            s["confusion_matrix"] = cm

        state.model_evaluation = s
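
# A minimal usage sketch (not part of this module) of reading the results that
# AutoGluonModelEvaluator stores under `state.model_evaluation`. The key names below
# come directly from the `_fit` body above; `predictor` and `df_test` are assumed
# to be defined.
#
#     import autogluon.eda.analysis as eda
#     import autogluon.eda.auto as auto
#
#     state = auto.analyze(
#         model=predictor,
#         val_data=df_test,
#         return_state=True,
#         anlz_facets=[eda.model.AutoGluonModelEvaluator(normalize="true")],
#     )
#     ev = state.model_evaluation
#     ev["leaderboard"]       # per-model scores on val_data
#     ev["importance"]        # permutation feature importance
#     ev["highest_error"]     # rows sorted by most confident wrong predictions
#     ev["undecided"]         # classification only: rows with the smallest top-2 score gap
#     ev["confusion_matrix"]  # classification only; normalized per `normalize`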