Source code for autogluon.core.models.ensemble.weighted_ensemble_model

import logging
from collections import defaultdict

import numpy as np
import pandas as pd

from .stacker_ensemble_model import StackerEnsembleModel
from ..greedy_ensemble.greedy_weighted_ensemble_model import GreedyWeightedEnsembleModel

logger = logging.getLogger(__name__)


# TODO: v0.1 see if this can be removed and logic moved to greedy weighted ensemble model -> Use StackerEnsembleModel as stacker instead
# TODO: Optimize predict speed when fit on kfold, can simply sum weights
[docs]class WeightedEnsembleModel(StackerEnsembleModel): """ Weighted ensemble meta-model that implements Ensemble Selection: https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf A :class:`autogluon.core.models.GreedyWeightedEnsembleModel` must be specified as the `model_base` to properly function. """ def __init__(self, **kwargs): super().__init__(**kwargs) self.low_memory = False def _fit(self, X, y, **kwargs): super()._fit(X, y, **kwargs) stack_columns = [] for model in self.models: model = self.load_child(model, verbose=False) stack_columns = stack_columns + [stack_column for stack_column in model.base_model_names if stack_column not in stack_columns] self.stack_column_prefix_lst = [stack_column for stack_column in self.stack_column_prefix_lst if stack_column in stack_columns] self.stack_columns, self.num_pred_cols_per_model = self.set_stack_columns(stack_column_prefix_lst=self.stack_column_prefix_lst) min_stack_column_prefix_to_model_map = {k: v for k, v in self.stack_column_prefix_to_model_map.items() if k in self.stack_column_prefix_lst} self.base_model_names = [base_model_name for base_model_name in self.base_model_names if base_model_name in min_stack_column_prefix_to_model_map.values()] self.stack_column_prefix_to_model_map = min_stack_column_prefix_to_model_map def _get_model_weights(self): weights_dict = defaultdict(int) num_models = len(self.models) for model in self.models: model: GreedyWeightedEnsembleModel = self.load_child(model, verbose=False) model_weight_dict = model._get_model_weights() for key in model_weight_dict.keys(): weights_dict[key] += model_weight_dict[key] for key in weights_dict: weights_dict[key] = weights_dict[key] / num_models return weights_dict def compute_feature_importance(self, X, y, features=None, is_oof=True, **kwargs) -> pd.DataFrame: logger.warning('Warning: non-raw feature importance calculation is not valid for weighted ensemble since it does not have features, returning ensemble weights instead...') if is_oof: fi = pd.Series(self._get_model_weights()).sort_values(ascending=False) else: logger.warning('Warning: Feature importance calculation is not yet implemented for WeightedEnsembleModel on unseen data, returning generic feature importance...') fi = pd.Series(self._get_model_weights()).sort_values(ascending=False) fi_df = fi.to_frame(name='importance') fi_df['stddev'] = np.nan fi_df['p_score'] = np.nan fi_df['n'] = np.nan # TODO: Rewrite preprocess() in greedy_weighted_ensemble_model to enable # fi_df = super().compute_feature_importance(X=X, y=y, features_to_use=features_to_use, preprocess=preprocess, is_oof=is_oof, **kwargs) return fi_df def _set_default_params(self): default_params = {'use_orig_features': False} for param, val in default_params.items(): self._set_default_param_value(param, val) super()._set_default_params()