Source code for autogluon.core.models.ensemble.bagged_ensemble_model
import copy
import logging
import os
import time
from collections import Counter
from statistics import mean
import numpy as np
import pandas as pd
from .fold_fitting_strategy import AbstractFoldFittingStrategy, SequentialLocalFoldFittingStrategy
from ..abstract.abstract_model import AbstractModel
from ...constants import MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE, REFIT_FULL_SUFFIX
from ...utils.exceptions import TimeLimitExceeded
from ...utils.loaders import load_pkl
from ...utils.savers import save_pkl
from ...utils.utils import CVSplitter, _compute_fi_with_stddev
logger = logging.getLogger(__name__)
# TODO: Add metadata object with info like score on each model, train time on each model, etc.
[docs]class BaggedEnsembleModel(AbstractModel):
"""
Bagged ensemble meta-model which fits a given model multiple times across different splits of the training data.
For certain child models such as KNN, this may only train a single model and instead rely on the child model to generate out-of-fold predictions.
"""
_oof_filename = 'oof.pkl'
def __init__(self, model_base: AbstractModel, random_state=0, **kwargs):
self.model_base = model_base
self._child_type = type(self.model_base)
self.models = []
self._oof_pred_proba = None
self._oof_pred_model_repeats = None
self._n_repeats = 0 # Number of n_repeats with at least 1 model fit, if kfold=5 and 8 models have been fit, _n_repeats is 2
self._n_repeats_finished = 0 # Number of n_repeats finished, if kfold=5 and 8 models have been fit, _n_repeats_finished is 1
self._k_fold_end = 0 # Number of models fit in current n_repeat (0 if completed), if kfold=5 and 8 models have been fit, _k_fold_end is 3
self._k = None # k models per n_repeat, equivalent to kfold value
self._k_per_n_repeat = [] # k-fold used for each n_repeat. == [5, 10, 3] if first kfold was 5, second was 10, and third was 3
self._random_state = random_state
self.low_memory = True
self._bagged_mode = None
# _child_oof currently is only set to True for KNN models, that are capable of LOO prediction generation to avoid needing bagging.
# TODO: Consider moving `_child_oof` logic to a separate class / refactor OOF logic.
# FIXME: Avoid unnecessary refit during refit_full on `_child_oof=True` models, just re-use the original model.
self._child_oof = False # Whether the OOF preds were taken from a single child model (Assumes child can produce OOF preds without bagging).
self._cv_splitters = [] # Keeps track of the CV splitter used for each bagged repeat.
super().__init__(problem_type=self.model_base.problem_type, eval_metric=self.model_base.eval_metric, **kwargs)
def _set_default_params(self):
default_params = {
# 'use_child_oof': False, # [Advanced] Whether to defer to child model for OOF preds and only train a single child.
'save_bag_folds': True,
# 'refit_folds': False, # [Advanced, Experimental] Whether to refit bags immediately to a refit_full model in a single .fit call.
}
for param, val in default_params.items():
self._set_default_param_value(param, val)
super()._set_default_params()
def _get_default_auxiliary_params(self) -> dict:
default_auxiliary_params = super()._get_default_auxiliary_params()
extra_auxiliary_params = dict(
drop_unique=False, # TODO: Get the value from child instead
)
default_auxiliary_params.update(extra_auxiliary_params)
return default_auxiliary_params
def is_valid(self):
return self.is_fit() and (self._n_repeats == self._n_repeats_finished)
def can_infer(self):
return self.is_fit() and self.params.get('save_bag_folds', True)
def is_stratified(self):
if self.problem_type in [REGRESSION, QUANTILE, SOFTCLASS]:
return False
else:
return True
def is_fit(self):
return len(self.models) != 0
def can_fit(self) -> bool:
return not self.is_fit() or self._bagged_mode
def is_valid_oof(self):
return self.is_fit() and (self._child_oof or self._bagged_mode)
def get_oof_pred_proba(self, **kwargs):
# TODO: Require is_valid == True (add option param to ignore is_valid)
return self._oof_pred_proba_func(self._oof_pred_proba, self._oof_pred_model_repeats)
@staticmethod
def _oof_pred_proba_func(oof_pred_proba, oof_pred_model_repeats):
oof_pred_model_repeats_without_0 = np.where(oof_pred_model_repeats == 0, 1, oof_pred_model_repeats)
if oof_pred_proba.ndim == 2:
oof_pred_model_repeats_without_0 = oof_pred_model_repeats_without_0[:, None]
return oof_pred_proba / oof_pred_model_repeats_without_0
def _init_misc(self, **kwargs):
child = self._get_model_base().convert_to_template()
child.initialize(**kwargs)
self.eval_metric = child.eval_metric
self.stopping_metric = child.stopping_metric
self.quantile_levels = child.quantile_levels
self.normalize_pred_probas = child.normalize_pred_probas
def preprocess(self, X, preprocess_nonadaptive=True, model=None, **kwargs):
if preprocess_nonadaptive:
if model is None:
if not self.models:
return X
model = self.models[0]
model = self.load_child(model)
return model.preprocess(X, preprocess_stateful=False)
else:
return X
def _get_cv_splitter(self, n_splits, n_repeats, groups=None):
return CVSplitter(n_splits=n_splits, n_repeats=n_repeats, groups=groups, stratified=self.is_stratified(), random_state=self._random_state)
def _fit(self,
X,
y,
X_val=None,
y_val=None,
k_fold=None,
k_fold_start=0,
k_fold_end=None,
n_repeats=1,
n_repeat_start=0,
groups=None,
**kwargs):
use_child_oof = self.params.get('use_child_oof', False)
if use_child_oof:
if self.is_fit():
# TODO: We may want to throw an exception instead and avoid calling fit more than once
return self
k_fold = 1
k_fold_end = None
groups = None
if k_fold is None and groups is None:
k_fold = 5
if k_fold is not None and k_fold < 1:
k_fold = 1
if k_fold is None or k_fold > 1:
k_fold = self._get_cv_splitter(n_splits=k_fold, n_repeats=n_repeats, groups=groups).n_splits
self._validate_bag_kwargs(
k_fold=k_fold,
k_fold_start=k_fold_start,
k_fold_end=k_fold_end,
n_repeats=n_repeats,
n_repeat_start=n_repeat_start,
groups=groups,
)
if k_fold_end is None:
k_fold_end = k_fold
model_base = self._get_model_base()
model_base.rename(name='')
kwargs['feature_metadata'] = self.feature_metadata
kwargs['num_classes'] = self.num_classes # TODO: maybe don't pass num_classes to children
if self.model_base is not None:
self.save_model_base(self.model_base)
self.model_base = None
if self._oof_pred_proba is None and self.is_fit():
self._load_oof()
save_bag_folds = self.params.get('save_bag_folds', True)
if k_fold == 1:
self._fit_single(X=X, y=y, model_base=model_base, use_child_oof=use_child_oof, **kwargs)
return self
else:
refit_folds = self.params.get('refit_folds', False)
if refit_folds:
save_bag_folds = False
if kwargs.get('time_limit', None) is not None:
fold_start = n_repeat_start * k_fold + k_fold_start
fold_end = (n_repeats - 1) * k_fold + k_fold_end
folds_to_fit = fold_end - fold_start
# Reserve time for final refit model
kwargs['time_limit'] = kwargs['time_limit'] * folds_to_fit / (folds_to_fit + 1.2)
self._fit_folds(X=X, y=y, model_base=model_base, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end,
n_repeats=n_repeats, n_repeat_start=n_repeat_start, save_folds=save_bag_folds, groups=groups, **kwargs)
# FIXME: Don't save folds except for refit
# FIXME: Cleanup self
# FIXME: Don't add `_FULL` to name
if refit_folds:
refit_template = self.convert_to_refit_full_template()
refit_template.params['use_child_oof'] = False
kwargs['time_limit'] = None
refit_template.fit(X=X, y=y, k_fold=1, **kwargs)
refit_template._oof_pred_proba = self._oof_pred_proba
refit_template._oof_pred_model_repeats = self._oof_pred_model_repeats
refit_template._child_oof = True
refit_template.fit_time += self.fit_time + self.predict_time
return refit_template
else:
return self
def _validate_bag_kwargs(self, *,
k_fold,
k_fold_start,
k_fold_end,
n_repeats,
n_repeat_start,
groups):
if groups is not None:
if self._n_repeats_finished != 0:
raise AssertionError('Bagged models cannot call fit with `groups` specified when a full k-fold set has already been fit.')
if n_repeats > 1:
raise AssertionError('Cannot perform repeated bagging with `groups` specified.')
return
if k_fold_end is None:
k_fold_end = k_fold
if k_fold is None:
raise ValueError('k_fold cannot be None.')
if k_fold < 1:
raise ValueError(f'k_fold must be equal or greater than 1, value: ({k_fold})')
if n_repeat_start != self._n_repeats_finished:
raise ValueError(f'n_repeat_start must equal self._n_repeats_finished, values: ({n_repeat_start}, {self._n_repeats_finished})')
if n_repeats <= n_repeat_start:
raise ValueError(f'n_repeats must be greater than n_repeat_start, values: ({n_repeats}, {n_repeat_start})')
if k_fold_start != self._k_fold_end:
raise ValueError(f'k_fold_start must equal previous k_fold_end, values: ({k_fold_start}, {self._k_fold_end})')
if k_fold_start >= k_fold_end:
# TODO: Remove this limitation if n_repeats > 1
raise ValueError(f'k_fold_end must be greater than k_fold_start, values: ({k_fold_end}, {k_fold_start})')
if (n_repeats - n_repeat_start) > 1 and k_fold_end != k_fold:
# TODO: Remove this limitation
raise ValueError(f'k_fold_end must equal k_fold when (n_repeats - n_repeat_start) > 1, values: ({k_fold_end}, {k_fold})')
if self._k is not None and self._k != k_fold:
raise ValueError(f'k_fold must equal previously fit k_fold value for the current n_repeat, values: (({k_fold}, {self._k})')
def predict_proba(self, X, normalize=None, **kwargs):
model = self.load_child(self.models[0])
X = self.preprocess(X, model=model, **kwargs)
pred_proba = model.predict_proba(X=X, preprocess_nonadaptive=False, normalize=normalize)
for model in self.models[1:]:
model = self.load_child(model)
pred_proba += model.predict_proba(X=X, preprocess_nonadaptive=False, normalize=normalize)
pred_proba = pred_proba / len(self.models)
return pred_proba
def _predict_proba(self, X, normalize=False, **kwargs):
return self.predict_proba(X=X, normalize=normalize, **kwargs)
def score_with_oof(self, y, sample_weight=None):
self._load_oof()
valid_indices = self._oof_pred_model_repeats > 0
y = y[valid_indices]
y_pred_proba = self.get_oof_pred_proba()[valid_indices]
if sample_weight is not None:
sample_weight = sample_weight[valid_indices]
return self.score_with_y_pred_proba(y=y, y_pred_proba=y_pred_proba, sample_weight=sample_weight)
def _fit_single(self, X, y, model_base, use_child_oof, time_limit=None, **kwargs):
if self.is_fit():
raise AssertionError('Model is already fit.')
if self._n_repeats != 0:
raise ValueError(f'n_repeats must equal 0 when fitting a single model with k_fold == 1, value: {self._n_repeats}')
model_base.name = f'{model_base.name}S1F1'
model_base.set_contexts(path_context=self.path + model_base.name + os.path.sep)
time_start_fit = time.time()
model_base.fit(X=X, y=y, time_limit=time_limit, **kwargs)
model_base.fit_time = time.time() - time_start_fit
model_base.predict_time = None
X_len = len(X)
# Check if pred_proba is going to take too long
if time_limit is not None and X_len >= 10000:
max_allowed_time = time_limit * 1.3 # allow some buffer
time_left = max(
max_allowed_time - model_base.fit_time,
time_limit * 0.1, # At least 10% of time_limit
10, # At least 10 seconds
)
# Sample at most 500 rows to estimate prediction time of all rows
# TODO: Consider moving this into end of abstract model fit for all models.
# Currently this only fixes problem when in bagged mode, if not bagging, then inference could still be problamatic
n_sample = min(500, round(X_len * 0.1))
frac = n_sample / X_len
X_sample = X.sample(n=n_sample)
time_start_predict = time.time()
model_base.predict_proba(X_sample)
time_predict_frac = time.time() - time_start_predict
time_predict_estimate = time_predict_frac / frac
logger.log(15, f'\t{round(time_predict_estimate, 2)}s\t= Estimated out-of-fold prediction time...')
if time_predict_estimate > time_left:
logger.warning(f'\tNot enough time to generate out-of-fold predictions for model. Estimated time required was {round(time_predict_estimate, 2)}s compared to {round(time_left, 2)}s of available time.')
raise TimeLimitExceeded
if use_child_oof:
logger.log(15, '\t`use_child_oof` was specified for this model. It will function similarly to a bagged model, but will only fit one child model.')
time_start_predict = time.time()
if model_base._get_tags().get('valid_oof', False):
self._oof_pred_proba = model_base.get_oof_pred_proba(X=X, y=y)
else:
logger.warning('\tWARNING: `use_child_oof` was specified but child model does not have a dedicated `get_oof_pred_proba` method. This model may have heavily overfit validation scores.')
self._oof_pred_proba = model_base.predict_proba(X=X)
self._child_oof = True
model_base.predict_time = time.time() - time_start_predict
model_base.val_score = model_base.score_with_y_pred_proba(y=y, y_pred_proba=self._oof_pred_proba)
else:
self._oof_pred_proba = model_base.predict_proba(X=X) # TODO: Cheater value, will be overfit to valid set
self._oof_pred_model_repeats = np.ones(shape=len(X), dtype=np.uint8)
self._n_repeats = 1
self._n_repeats_finished = 1
self._k_per_n_repeat = [1]
self._bagged_mode = False
model_base.reduce_memory_size(remove_fit=True, remove_info=False, requires_save=True)
if not self.params.get('save_bag_folds', True):
model_base.model = None
if self.low_memory:
self.save_child(model_base, verbose=False)
self.models = [model_base.name]
else:
self.models = [model_base]
self._add_child_times_to_bag(model=model_base)
def _fit_folds(self,
X,
y,
model_base,
k_fold=None,
k_fold_start=0,
k_fold_end=None,
n_repeats=1,
n_repeat_start=0,
time_limit=None,
sample_weight=None,
save_folds=True,
groups=None,
**kwargs):
fold_fitting_strategy = self.params.get('fold_fitting_strategy', SequentialLocalFoldFittingStrategy)
# TODO: Preprocess data here instead of repeatedly
# FIXME: Raise exception if multiclass/binary and a single val fold contains all instances of a class. (Can happen if custom groups is specified)
time_start = time.time()
if k_fold_start != 0:
cv_splitter = self._cv_splitters[n_repeat_start]
else:
cv_splitter = self._get_cv_splitter(n_splits=k_fold, n_repeats=n_repeats, groups=groups)
if k_fold != cv_splitter.n_splits:
k_fold = cv_splitter.n_splits
if k_fold_end is None:
k_fold_end = k_fold
kfolds = cv_splitter.split(X=X, y=y)
oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(X=X, y=y)
models = []
fold_start = n_repeat_start * k_fold + k_fold_start
fold_end = (n_repeats - 1) * k_fold + k_fold_end
folds_to_fit = fold_end - fold_start
# noinspection PyCallingNonCallable
fold_fitting_strategy: AbstractFoldFittingStrategy = fold_fitting_strategy(
self, X, y, sample_weight, time_limit, time_start, models, oof_pred_proba, oof_pred_model_repeats, save_folds=save_folds)
for j in range(n_repeat_start, n_repeats): # For each n_repeat
if j != n_repeat_start or k_fold_start == 0:
self._cv_splitters.append(cv_splitter)
cur_repeat_count = j - n_repeat_start
fold_start_n_repeat = fold_start + cur_repeat_count * k_fold
fold_end_n_repeat = min(fold_start_n_repeat + k_fold, fold_end)
for i in range(fold_start_n_repeat, fold_end_n_repeat): # For each fold
fold_num_in_repeat = i - (j * k_fold) # The fold in the current repeat set (first fold in set = 0)
fold_ctx = dict(
model_name_suffix=f'S{j + 1}F{fold_num_in_repeat + 1}', # S5F3 = 3rd fold of the 5th repeat set
fold=kfolds[i],
is_last_fold=i != (fold_end - 1),
folds_to_fit=folds_to_fit,
folds_finished=i - fold_start,
folds_left=fold_end - i,
)
fold_fitting_strategy.schedule_fold_model_fit(model_base, fold_ctx, kwargs)
if (fold_end_n_repeat != fold_end) or (k_fold == k_fold_end):
self._k_per_n_repeat.append(k_fold)
fold_fitting_strategy.after_all_folds_scheduled()
self.models += models
self._bagged_mode = True
if self._oof_pred_proba is None:
self._oof_pred_proba = oof_pred_proba
self._oof_pred_model_repeats = oof_pred_model_repeats
else:
self._oof_pred_proba += oof_pred_proba
self._oof_pred_model_repeats += oof_pred_model_repeats
self._n_repeats = n_repeats
if k_fold == k_fold_end:
self._k = None
self._k_fold_end = 0
self._n_repeats_finished = self._n_repeats
else:
self._k = k_fold
self._k_fold_end = k_fold_end
self._n_repeats_finished = self._n_repeats - 1
# TODO: Augment to generate OOF after shuffling each column in X (Batching), this is the fastest way.
# TODO: Reduce logging clutter during OOF importance calculation (Currently logs separately for each child)
# Generates OOF predictions from pre-trained bagged models, assuming X and y are in the same row order as used in .fit(X, y)
def compute_feature_importance(self,
X,
y,
features=None,
silent=False,
time_limit=None,
is_oof=False,
**kwargs) -> pd.DataFrame:
if features is None:
# FIXME: use FULL features (children can have different features)
features = self.load_child(model=self.models[0]).features
if not is_oof:
return super().compute_feature_importance(X, y, features=features, time_limit=time_limit, silent=silent, **kwargs)
fi_fold_list = []
model_index = 0
num_children = len(self.models)
if time_limit is not None:
time_limit_per_child = time_limit / num_children
else:
time_limit_per_child = None
if not silent:
logging_message = f'Computing feature importance via permutation shuffling for {len(features)} features using out-of-fold (OOF) data aggregated across {num_children} child models...'
if time_limit is not None:
logging_message = f'{logging_message} Time limit: {time_limit}s...'
logger.log(20, logging_message)
time_start = time.time()
early_stop = False
children_completed = 0
log_final_suffix = ''
for n_repeat, k in enumerate(self._k_per_n_repeat):
if is_oof:
if self._child_oof or not self._bagged_mode:
raise AssertionError('Model trained with no validation data cannot get feature importances on training data, please specify new test data to compute feature importances (model=%s)' % self.name)
kfolds = self._cv_splitters[n_repeat].split(X=X, y=y)
cur_kfolds = kfolds[n_repeat * k:(n_repeat + 1) * k]
else:
cur_kfolds = [(None, list(range(len(X))))] * k
for i, fold in enumerate(cur_kfolds):
_, test_index = fold
model = self.load_child(self.models[model_index + i])
fi_fold = model.compute_feature_importance(X=X.iloc[test_index, :], y=y.iloc[test_index], features=features, time_limit=time_limit_per_child,
silent=silent, log_prefix='\t', importance_as_list=True, **kwargs)
fi_fold_list.append(fi_fold)
children_completed += 1
if time_limit is not None and children_completed != num_children:
time_now = time.time()
time_left = time_limit - (time_now - time_start)
time_child_average = (time_now - time_start) / children_completed
if time_left < (time_child_average * 1.1):
log_final_suffix = f' (Early stopping due to lack of time...)'
early_stop = True
break
if early_stop:
break
model_index += k
# TODO: DON'T THROW AWAY SAMPLES! USE LARGER N
fi_list_dict = dict()
for val in fi_fold_list:
val = val['importance'].to_dict() # TODO: Don't throw away stddev information of children
for key in val:
if key not in fi_list_dict:
fi_list_dict[key] = []
fi_list_dict[key] += val[key]
fi_df = _compute_fi_with_stddev(fi_list_dict)
if not silent:
logger.log(20, f'\t{round(time.time() - time_start, 2)}s\t= Actual runtime (Completed {children_completed} of {num_children} children){log_final_suffix}')
return fi_df
def load_child(self, model, verbose=False) -> AbstractModel:
if isinstance(model, str):
child_path = self.create_contexts(self.path + model + os.path.sep)
return self._child_type.load(path=child_path, verbose=verbose)
else:
return model
def save_child(self, model, verbose=False):
child = self.load_child(model)
child.set_contexts(self.path + child.name + os.path.sep)
child.save(verbose=verbose)
# TODO: Multiply epochs/n_iterations by some value (such as 1.1) to account for having more training data than bagged models
def convert_to_refit_full_template(self):
init_args = self.get_params()
init_args['hyperparameters']['save_bag_folds'] = True # refit full models must save folds
init_args['model_base'] = self.convert_to_refit_full_template_child()
init_args['name'] = init_args['name'] + REFIT_FULL_SUFFIX
model_full_template = self.__class__(**init_args)
return model_full_template
def convert_to_refit_full_template_child(self):
refit_params_trained = self._get_compressed_params_trained()
refit_params = copy.deepcopy(self._get_model_base().get_params())
refit_params['hyperparameters'].update(refit_params_trained)
refit_child_template = self._child_type(**refit_params)
return refit_child_template
def get_params(self):
init_args = dict(
model_base=self._get_model_base(),
random_state=self._random_state,
)
init_args.update(super().get_params())
init_args.pop('eval_metric')
init_args.pop('problem_type')
return init_args
def _get_compressed_params(self, model_params_list=None):
if model_params_list is None:
model_params_list = [
self.load_child(child).get_trained_params()
for child in self.models
]
model_params_compressed = dict()
for param in model_params_list[0].keys():
model_param_vals = [model_params[param] for model_params in model_params_list]
if all(isinstance(val, bool) for val in model_param_vals):
counter = Counter(model_param_vals)
compressed_val = counter.most_common(1)[0][0]
elif all(isinstance(val, int) for val in model_param_vals):
compressed_val = round(mean(model_param_vals))
elif all(isinstance(val, float) for val in model_param_vals):
compressed_val = mean(model_param_vals)
else:
try:
counter = Counter(model_param_vals)
compressed_val = counter.most_common(1)[0][0]
except TypeError:
compressed_val = model_param_vals[0]
model_params_compressed[param] = compressed_val
return model_params_compressed
def _get_compressed_params_trained(self):
model_params_list = [
self.load_child(child).params_trained
for child in self.models
]
return self._get_compressed_params(model_params_list=model_params_list)
def _get_model_base(self):
if self.model_base is None:
return self.load_model_base()
else:
return self.model_base
def _add_child_times_to_bag(self, model):
if self.fit_time is None:
self.fit_time = model.fit_time
else:
self.fit_time += model.fit_time
if self.predict_time is None:
self.predict_time = model.predict_time
else:
self.predict_time += model.predict_time
@classmethod
def load(cls, path: str, reset_paths=True, low_memory=True, load_oof=False, verbose=True):
model = super().load(path=path, reset_paths=reset_paths, verbose=verbose)
if not low_memory:
model.persist_child_models(reset_paths=reset_paths)
if load_oof:
model._load_oof()
return model
@classmethod
def load_oof(cls, path, verbose=True):
try:
oof = load_pkl.load(path=path + 'utils' + os.path.sep + cls._oof_filename, verbose=verbose)
oof_pred_proba = oof['_oof_pred_proba']
oof_pred_model_repeats = oof['_oof_pred_model_repeats']
except FileNotFoundError:
model = cls.load(path=path, reset_paths=True, verbose=verbose)
model._load_oof()
oof_pred_proba = model._oof_pred_proba
oof_pred_model_repeats = model._oof_pred_model_repeats
return cls._oof_pred_proba_func(oof_pred_proba=oof_pred_proba, oof_pred_model_repeats=oof_pred_model_repeats)
def _load_oof(self):
if self._oof_pred_proba is not None:
pass
else:
oof = load_pkl.load(path=self.path + 'utils' + os.path.sep + self._oof_filename)
self._oof_pred_proba = oof['_oof_pred_proba']
self._oof_pred_model_repeats = oof['_oof_pred_model_repeats']
def persist_child_models(self, reset_paths=True):
for i, model_name in enumerate(self.models):
if isinstance(model_name, str):
child_path = self.create_contexts(self.path + model_name + os.path.sep)
child_model = self._child_type.load(path=child_path, reset_paths=reset_paths, verbose=True)
self.models[i] = child_model
def load_model_base(self):
return load_pkl.load(path=self.path + 'utils' + os.path.sep + 'model_template.pkl')
def save_model_base(self, model_base):
save_pkl.save(path=self.path + 'utils' + os.path.sep + 'model_template.pkl', object=model_base)
def save(self, path=None, verbose=True, save_oof=True, save_children=False) -> str:
if path is None:
path = self.path
if save_children:
model_names = []
for child in self.models:
child = self.load_child(child)
child.set_contexts(path + child.name + os.path.sep)
child.save(verbose=False)
model_names.append(child.name)
self.models = model_names
if save_oof and self._oof_pred_proba is not None:
save_pkl.save(path=path + 'utils' + os.path.sep + self._oof_filename, object={
'_oof_pred_proba': self._oof_pred_proba,
'_oof_pred_model_repeats': self._oof_pred_model_repeats,
})
self._oof_pred_proba = None
self._oof_pred_model_repeats = None
return super().save(path=path, verbose=verbose)
# If `remove_fit_stack=True`, variables will be removed that are required to fit more folds and to fit new stacker models which use this model as a base model.
# This includes OOF variables.
def reduce_memory_size(self, remove_fit_stack=False, remove_fit=True, remove_info=False, requires_save=True, reduce_children=False, **kwargs):
super().reduce_memory_size(remove_fit=remove_fit, remove_info=remove_info, requires_save=requires_save, **kwargs)
if remove_fit_stack:
try:
os.remove(self.path + 'utils' + os.path.sep + self._oof_filename)
except FileNotFoundError:
pass
if requires_save:
self._oof_pred_proba = None
self._oof_pred_model_repeats = None
try:
os.remove(self.path + 'utils' + os.path.sep + 'model_template.pkl')
except FileNotFoundError:
pass
if requires_save:
self.model_base = None
try:
os.rmdir(self.path + 'utils')
except OSError:
pass
if reduce_children:
for model in self.models:
model = self.load_child(model)
model.reduce_memory_size(remove_fit=remove_fit, remove_info=remove_info, requires_save=requires_save, **kwargs)
if requires_save and self.low_memory:
self.save_child(model=model)
def _get_model_names(self):
model_names = []
for model in self.models:
if isinstance(model, str):
model_names.append(model)
else:
model_names.append(model.name)
return model_names
def get_info(self):
info = super().get_info()
children_info = self._get_child_info()
child_memory_sizes = [child['memory_size'] for child in children_info.values()]
sum_memory_size_child = sum(child_memory_sizes)
if child_memory_sizes:
max_memory_size_child = max(child_memory_sizes)
else:
max_memory_size_child = 0
if self.low_memory:
max_memory_size = info['memory_size'] + sum_memory_size_child
min_memory_size = info['memory_size'] + max_memory_size_child
else:
max_memory_size = info['memory_size']
min_memory_size = info['memory_size'] - sum_memory_size_child + max_memory_size_child
# Necessary if save_space is used as save_space deletes model_base.
if len(self.models) > 0:
child_model = self.load_child(self.models[0])
else:
child_model = self._get_model_base()
child_hyperparameters = child_model.params
child_ag_args_fit = child_model.params_aux
bagged_info = dict(
child_model_type=self._child_type.__name__,
num_child_models=len(self.models),
child_model_names=self._get_model_names(),
_n_repeats=self._n_repeats,
# _n_repeats_finished=self._n_repeats_finished, # commented out because these are too technical
# _k_fold_end=self._k_fold_end,
# _k=self._k,
_k_per_n_repeat=self._k_per_n_repeat,
_random_state=self._random_state,
low_memory=self.low_memory, # If True, then model will attempt to use at most min_memory_size memory by having at most one child in memory. If False, model will use max_memory_size memory.
bagged_mode=self._bagged_mode,
max_memory_size=max_memory_size, # Memory used when all children are loaded into memory at once.
min_memory_size=min_memory_size, # Memory used when only the largest child is loaded into memory.
child_hyperparameters=child_hyperparameters,
child_hyperparameters_fit=self._get_compressed_params_trained(),
child_ag_args_fit=child_ag_args_fit,
)
info['bagged_info'] = bagged_info
info['children_info'] = children_info
child_features_full = list(set().union(*[child['features'] for child in children_info.values()]))
info['features'] = child_features_full
info['num_features'] = len(child_features_full)
return info
def get_memory_size(self):
models = self.models
self.models = None
memory_size = super().get_memory_size()
self.models = models
return memory_size
def _get_child_info(self):
child_info_dict = dict()
for model in self.models:
if isinstance(model, str):
child_path = self.create_contexts(self.path + model + os.path.sep)
child_info_dict[model] = self._child_type.load_info(child_path)
else:
child_info_dict[model.name] = model.get_info()
return child_info_dict
def _construct_empty_oof(self, X, y):
if self.problem_type == MULTICLASS:
oof_pred_proba = np.zeros(shape=(len(X), len(y.unique())), dtype=np.float32)
elif self.problem_type == SOFTCLASS:
oof_pred_proba = np.zeros(shape=y.shape, dtype=np.float32)
elif self.problem_type == QUANTILE:
oof_pred_proba = np.zeros(shape=(len(X), len(self.quantile_levels)), dtype=np.float32)
else:
oof_pred_proba = np.zeros(shape=len(X), dtype=np.float32)
oof_pred_model_repeats = np.zeros(shape=len(X), dtype=np.uint8)
return oof_pred_proba, oof_pred_model_repeats
def _preprocess_fit_resources(self, silent=False, **kwargs):
"""Pass along to child models to avoid altering up-front"""
return kwargs
# TODO: Currently double disk usage, saving model in HPO and also saving model in bag
# FIXME: with use_bag_holdout=True, the fold-1 scores that are logged are of the inner validation score, not the holdout score.
# Fix this by passing X_val, y_val into this method
def _hyperparameter_tune(self, X, y, k_fold, scheduler_options, preprocess_kwargs=None, groups=None, **kwargs):
if len(self.models) != 0:
raise ValueError('self.models must be empty to call hyperparameter_tune, value: %s' % self.models)
kwargs['feature_metadata'] = self.feature_metadata
kwargs['num_classes'] = self.num_classes # TODO: maybe don't pass num_classes to children
self.model_base.set_contexts(self.path + 'hpo' + os.path.sep)
# TODO: Preprocess data here instead of repeatedly
if preprocess_kwargs is None:
preprocess_kwargs = dict()
use_child_oof = self.params.get('use_child_oof', False)
X = self.preprocess(X=X, preprocess=False, fit=True, **preprocess_kwargs)
if use_child_oof:
k_fold = 1
X_fold = X
y_fold = y
X_val_fold = None
y_val_fold = None
train_index = list(range(len(X)))
test_index = train_index
cv_splitter = None
else:
cv_splitter = self._get_cv_splitter(n_splits=k_fold, n_repeats=1, groups=groups)
if k_fold != cv_splitter.n_splits:
k_fold = cv_splitter.n_splits
kfolds = cv_splitter.split(X=X, y=y)
train_index, test_index = kfolds[0]
X_fold, X_val_fold = X.iloc[train_index, :], X.iloc[test_index, :]
y_fold, y_val_fold = y.iloc[train_index], y.iloc[test_index]
orig_time = scheduler_options[1]['time_out']
if orig_time:
scheduler_options[1]['time_out'] = orig_time * 0.8 # TODO: Scheduler doesn't early stop on final model, this is a safety net. Scheduler should be updated to early stop
hpo_models, hpo_model_performances, hpo_results = self.model_base.hyperparameter_tune(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, scheduler_options=scheduler_options, **kwargs)
scheduler_options[1]['time_out'] = orig_time
bags = {}
bags_performance = {}
for i, (model_name, model_path) in enumerate(hpo_models.items()):
child: AbstractModel = self._child_type.load(path=model_path)
# TODO: Create new Ensemble Here
bag = copy.deepcopy(self)
bag.rename(f"{bag.name}{os.path.sep}T{i}")
bag.set_contexts(self.path_root + bag.name + os.path.sep)
oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(X=X, y=y)
if child._get_tags().get('valid_oof', False):
y_pred_proba = child.get_oof_pred_proba(X=X, y=y)
bag._n_repeats_finished = 1
bag._k_per_n_repeat = [1]
bag._bagged_mode = False
bag._child_oof = True # TODO: Consider a separate tag for refit_folds vs efficient OOF
else:
y_pred_proba = child.predict_proba(X_val_fold)
oof_pred_proba[test_index] += y_pred_proba
oof_pred_model_repeats[test_index] += 1
bag.model_base = None
child.rename('')
child.set_contexts(bag.path + child.name + os.path.sep)
bag.save_model_base(child.convert_to_template())
bag._k = k_fold
bag._k_fold_end = 1
bag._n_repeats = 1
bag._oof_pred_proba = oof_pred_proba
bag._oof_pred_model_repeats = oof_pred_model_repeats
child.rename('S1F1')
child.set_contexts(bag.path + child.name + os.path.sep)
if not self.params.get('save_bag_folds', True):
child.model = None
if bag.low_memory:
bag.save_child(child, verbose=False)
bag.models.append(child.name)
else:
bag.models.append(child)
bag.val_score = child.val_score
bag._add_child_times_to_bag(model=child)
if cv_splitter is not None:
bag._cv_splitters = [cv_splitter]
bag.save()
bags[bag.name] = bag.path
bags_performance[bag.name] = bag.val_score
# TODO: hpo_results likely not correct because no renames
return bags, bags_performance, hpo_results
def _more_tags(self):
return {'valid_oof': True}