import logging
import math
import pickle
import sys
import time
import numpy as np
from autogluon.common.features.types import R_BOOL, R_CATEGORY, R_FLOAT, R_INT
from autogluon.common.utils.resource_utils import ResourceManager
from autogluon.core.constants import MULTICLASS, QUANTILE, REGRESSION, SOFTCLASS
from autogluon.core.models import AbstractModel
from autogluon.core.utils.exceptions import NotEnoughMemoryError, TimeLimitExceeded
from autogluon.core.utils.utils import normalize_pred_probas
from autogluon.features.generators import LabelEncoderFeatureGenerator
from .compilers.native import RFNativeCompiler
from .compilers.onnx import RFOnnxCompiler
logger = logging.getLogger(__name__)
class RFModel(AbstractModel):
"""
Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._feature_generator = None
self._daal = False # Whether daal4py backend is being used
self._num_features_post_process = None
def _get_model_type(self):
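        """Return the forest class to instantiate: a quantile regressor for QUANTILE problems,
        the sklearnex backend if `use_daal=True` and sklearnex is importable, otherwise the
        standard scikit-learn RandomForestClassifier/RandomForestRegressor.
        """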
if self.problem_type == QUANTILE:
from .rf_quantile import RandomForestQuantileRegressor
return RandomForestQuantileRegressor
if self.params_aux.get("use_daal", False):
# Disabled by default because OOB score does not yet work properly
try:
# FIXME: sklearnex OOB score is broken, returns biased predictions. Without this optimization, can't compute Efficient OOF.
# Refer to https://github.com/intel/scikit-learn-intelex/issues/933
# Current workaround: Forcibly set oob_score=True during fit to compute OOB during train time.
# Downsides:
# 1. Slows down training slightly by forcing computation of OOB even if OOB is not needed (such as in medium_quality)
# 2. Makes computing the correct pred_time_val difficult, as the time is instead added to the fit_time,
# and we would need to waste extra time to compute the proper pred_time_val post-fit.
# Therefore with sklearnex enabled, pred_time_val is incorrect.
from sklearnex.ensemble import RandomForestClassifier, RandomForestRegressor
logger.log(15, "\tUsing sklearnex RF backend...")
self._daal = True
            except ImportError:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
self._daal = False
else:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
self._daal = False
if self.problem_type in [REGRESSION, SOFTCLASS]:
return RandomForestRegressor
else:
return RandomForestClassifier
# TODO: X.fillna -inf? Add extra is_missing column?
def _preprocess(self, X, **kwargs):
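        """Convert input features to a float32 numpy array: category features are
        label-encoded (the encoder is fit on the first call) and missing values are filled with 0.
        """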
X = super()._preprocess(X, **kwargs)
if self._feature_generator is None:
self._feature_generator = LabelEncoderFeatureGenerator(verbosity=0)
self._feature_generator.fit(X=X)
if self._feature_generator.features_in:
X = X.copy()
X[self._feature_generator.features_in] = self._feature_generator.transform(X=X)
X = X.fillna(0).to_numpy(dtype=np.float32)
return X
def _set_default_params(self):
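        """Set default hyperparameter values for any params the user did not explicitly specify."""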
default_params = {
# TODO: 600 is much better, but increases info leakage in stacking -> therefore 300 is ~equal in stack ensemble final quality.
# Consider adding targeted noise to OOF to avoid info leakage, or increase `min_samples_leaf`.
"n_estimators": 300,
# Cap leaf nodes to 15000 to avoid large datasets using unreasonable amounts of memory/disk for RF/XT.
# Ensures that memory and disk usage of RF model with 300 n_estimators is at most ~500 MB for binary/regression, ~200 MB per class for multiclass.
# This has no effect on datasets with <=15000 rows, and minimal to no impact on datasets with <50000 rows.
# For large datasets, will often make the model worse, but will significantly speed up inference speed and massively reduce memory and disk usage.
# For example, when left uncapped, RF can use 5 GB of disk for a regression dataset with 2M rows.
# Multiply by the 8 RF/XT models in config for best quality / high quality and this is 40 GB of tree models, which is unreasonable.
# This size scales linearly with number of rows.
"max_leaf_nodes": 15000,
"n_jobs": -1,
"random_state": 0,
"bootstrap": True, # Required for OOB estimates, setting to False will raise exception if bagging.
# TODO: min_samples_leaf=5 is too large on most problems, however on some datasets it helps a lot (airlines likes >40 min_samples_leaf, adult likes 2 much better than 1)
# This value would need to be tuned per dataset, likely very worthwhile.
# Higher values = less OOF info leak, default = 1, which maximizes info leak.
# 'min_samples_leaf': 5, # Significantly reduces info leakage to stacker models. Never use the default/1 when using as base model.
# 'oob_score': True, # Disabled by default as it is better to do it post-fit via custom logic.
}
for param, val in default_params.items():
self._set_default_param_value(param, val)
# TODO: Add in documentation that Categorical default is the first index
# TODO: enable HPO for RF models
def _get_default_searchspace(self):
spaces = {
# 'n_estimators': Int(lower=10, upper=1000, default=300),
# 'max_features': Categorical(['auto', 0.5, 0.25]),
# 'criterion': Categorical(['gini', 'entropy']),
}
return spaces
def _get_num_trees_per_estimator(self):
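        """Return a per-estimator memory-scaling multiplier: ~`num_classes` for
        multiclass/softclass (each tree stores one output per class; falls back to 10 if
        the class count is unknown), else 1.
        """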
        # Very rough guess of the size of a single tree prior to training
if self.problem_type in [MULTICLASS, SOFTCLASS]:
if self.num_classes is None:
num_trees_per_estimator = 10 # Guess since it wasn't passed in, could also check y for a better value
else:
num_trees_per_estimator = self.num_classes
else:
num_trees_per_estimator = 1
return num_trees_per_estimator
def _estimate_memory_usage(self, X, **kwargs):
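        """Estimate the minimum memory (in bytes) needed to fit the model.

        Uses a rough heuristic of `num_trees_per_estimator * len(X) / 60000 * 1e6` bytes per
        estimator, scaled by the minimum estimator count (at most 40). For example, a binary
        problem with 600,000 rows gives ~10 MB per estimator, hence ~400 MB for 40 estimators.
        """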
params = self._get_model_params()
n_estimators_final = params["n_estimators"]
if isinstance(n_estimators_final, int):
n_estimators_minimum = min(40, n_estimators_final)
else: # if search space
n_estimators_minimum = 40
num_trees_per_estimator = self._get_num_trees_per_estimator()
bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6 # Underestimates by 3x on ExtraTrees
expected_min_memory_usage = bytes_per_estimator * n_estimators_minimum
return expected_min_memory_usage
    def _validate_fit_memory_usage(self, mem_error_threshold: float = 0.5, mem_warning_threshold: float = 0.4, mem_size_threshold: int = int(1e7), **kwargs):
return super()._validate_fit_memory_usage(
mem_error_threshold=mem_error_threshold, mem_warning_threshold=mem_warning_threshold, mem_size_threshold=mem_size_threshold, **kwargs
)
def _expected_mem_usage(self, n_estimators_final, bytes_per_estimator):
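        """Return the expected model memory usage as a fraction of currently available virtual memory."""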
available_mem = ResourceManager.get_available_virtual_mem()
return n_estimators_final * bytes_per_estimator / available_mem
def _fit(self, X, y, num_cpus=-1, time_limit=None, sample_weight=None, **kwargs):
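        """Fit the forest, in up to two warm-start increments when `n_estimators` is large:
        a small initial fit measures per-tree memory and time, and the final tree count is
        reduced (or the fit aborted) if the extrapolated usage would exceed available memory
        or `time_limit`.
        """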
time_start = time.time()
model_cls = self._get_model_type()
max_memory_usage_ratio = self.params_aux["max_memory_usage_ratio"]
params = self._get_model_params()
if "n_jobs" not in params:
params["n_jobs"] = num_cpus
n_estimators_final = params["n_estimators"]
n_estimators_minimum = min(40, n_estimators_final)
n_estimators_test = min(4, max(1, math.floor(n_estimators_minimum / 5)))
X = self.preprocess(X)
n_estimator_increments = [n_estimators_final]
num_trees_per_estimator = self._get_num_trees_per_estimator()
bytes_per_estimator = num_trees_per_estimator * len(X) / 60000 * 1e6 # Underestimates by 3x on ExtraTrees
expected_memory_usage = self._expected_mem_usage(n_estimators_final, bytes_per_estimator)
if n_estimators_final > n_estimators_test * 2:
if self.problem_type == MULTICLASS:
n_estimator_increments = [n_estimators_test, n_estimators_final]
params["warm_start"] = True
else:
if expected_memory_usage > (0.05 * max_memory_usage_ratio): # Somewhat arbitrary, consider finding a better value, should it scale by cores?
# Causes ~10% training slowdown, so try to avoid if memory is not an issue
n_estimator_increments = [n_estimators_test, n_estimators_final]
params["warm_start"] = True
params["n_estimators"] = n_estimator_increments[0]
if self._daal:
if params.get("warm_start", False):
params["warm_start"] = False
# FIXME: This is inefficient but sklearnex doesn't support computing oob_score after training
params["oob_score"] = True
model = model_cls(**params)
time_train_start = time.time()
for i, n_estimators in enumerate(n_estimator_increments):
if i != 0:
if params.get("warm_start", False):
model.n_estimators = n_estimators
else:
params["n_estimators"] = n_estimators
model = model_cls(**params)
model = model.fit(X, y, sample_weight=sample_weight)
if (i == 0) and (len(n_estimator_increments) > 1):
time_elapsed = max(time.time() - time_train_start, 0.001) # avoid it being too small and being truncated to 0
model_size_bytes = 0
for estimator in model.estimators_: # Uses far less memory than pickling the entire forest at once
model_size_bytes += sys.getsizeof(pickle.dumps(estimator))
expected_final_model_size_bytes = model_size_bytes * (n_estimators_final / model.n_estimators)
available_mem = ResourceManager.get_available_virtual_mem()
model_memory_ratio = expected_final_model_size_bytes / available_mem
ideal_memory_ratio = 0.15 * max_memory_usage_ratio
n_estimators_ideal = min(n_estimators_final, math.floor(ideal_memory_ratio / model_memory_ratio * n_estimators_final))
if n_estimators_final > n_estimators_ideal:
if n_estimators_ideal < n_estimators_minimum:
logger.warning(f"\tWarning: Model is expected to require {round(model_memory_ratio*100, 2)}% of available memory...")
raise NotEnoughMemoryError # don't train full model to avoid OOM error
logger.warning(
f"\tWarning: Reducing model 'n_estimators' from {n_estimators_final} -> {n_estimators_ideal} due to low memory. Expected memory usage reduced from {round(model_memory_ratio*100, 2)}% -> {round(ideal_memory_ratio*100, 2)}% of available memory..."
)
if time_limit is not None:
time_expected = time_train_start - time_start + (time_elapsed * n_estimators_ideal / n_estimators)
n_estimators_time = math.floor((time_limit - time_train_start + time_start) * n_estimators / time_elapsed)
if n_estimators_time < n_estimators_ideal:
if n_estimators_time < n_estimators_minimum:
logger.warning(
f"\tWarning: Model is expected to require {round(time_expected, 1)}s to train, which exceeds the maximum time limit of {round(time_limit, 1)}s, skipping model..."
)
raise TimeLimitExceeded
logger.warning(
f"\tWarning: Reducing model 'n_estimators' from {n_estimators_ideal} -> {n_estimators_time} due to low time. Expected time usage reduced from {round(time_expected, 1)}s -> {round(time_limit, 1)}s..."
)
n_estimators_ideal = n_estimators_time
for j in range(len(n_estimator_increments)):
if n_estimator_increments[j] > n_estimators_ideal:
n_estimator_increments[j] = n_estimators_ideal
        if self._daal and model.criterion != "entropy":
            # The sklearnex backend does not need the per-tree `estimators_` for inference,
            # so dropping them reduces memory and disk usage.
            # TODO: entropy is not accelerated by sklearnex; `estimators_` must be kept in
            #  that case to avoid a crash at prediction time.
            model.estimators_ = None
self.model = model
self.params_trained["n_estimators"] = self.model.n_estimators
# TODO: Remove this after simplifying _predict_proba to reduce code duplication. This is only present for SOFTCLASS support.
def _predict_proba(self, X, **kwargs):
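        """Return prediction probabilities, or raw predictions for regression, softclass,
        and quantile problems."""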
X = self.preprocess(X, **kwargs)
if self.problem_type == REGRESSION:
return self.model.predict(X)
elif self.problem_type == SOFTCLASS:
return self.model.predict(X)
elif self.problem_type == QUANTILE:
return self.model.predict(X, quantile_levels=self.quantile_levels)
y_pred_proba = self.model.predict_proba(X)
return self._convert_proba_to_unified_form(y_pred_proba)
def predict_proba_oof(self, X, normalize=None, **kwargs):
"""X should be the same X passed to `.fit`"""
y_oof_pred_proba = self._predict_proba_oof(X=X, **kwargs)
if normalize is None:
normalize = self.normalize_pred_probas
if normalize:
y_oof_pred_proba = normalize_pred_probas(y_oof_pred_proba, self.problem_type)
y_oof_pred_proba = y_oof_pred_proba.astype(np.float32)
return y_oof_pred_proba
def _is_sklearn_1(self) -> bool:
"""Returns True if the trained model is from sklearn>=1.0"""
return callable(getattr(self.model, "_set_oob_score_and_attributes", None))
def _model_supports_oob_pred_proba(self) -> bool:
"""Returns True if model supports computing out-of-bag prediction probabilities"""
# TODO: Remove `_set_oob_score` after sklearn version requirement is >=1.0
return callable(getattr(self.model, "_set_oob_score", None)) or self._is_sklearn_1()
# FIXME: Unknown if this works with quantile regression
def _predict_proba_oof(self, X, y, **kwargs):
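        """Compute out-of-fold prediction probabilities from the out-of-bag (OOB) estimates
        of the fitted forest. Requires `bootstrap=True`. If OOB attributes were not already
        computed during fit, sklearn's private OOB-scoring methods are invoked on the
        training data; rows with no OOB prediction (possible with few trees) are imputed
        with the column mean.
        """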
if not self.model.bootstrap:
raise ValueError("Forest models must set `bootstrap=True` to compute out-of-fold predictions via out-of-bag predictions.")
oob_is_not_set = getattr(self.model, "oob_decision_function_", None) is None and getattr(self.model, "oob_prediction_", None) is None
if oob_is_not_set and self._daal:
raise AssertionError("DAAL forest backend does not support out-of-bag predictions.")
# TODO: This can also be done via setting `oob_score=True` in model params,
# but getting the correct `pred_time_val` that way is not easy, since we can't time the internal call.
if oob_is_not_set and self._model_supports_oob_pred_proba():
X = self.preprocess(X)
if getattr(self.model, "n_classes_", None) is not None:
if self.model.n_outputs_ == 1:
self.model.n_classes_ = [self.model.n_classes_]
from sklearn.tree._tree import DOUBLE, DTYPE
X, y = self.model._validate_data(X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE)
if y.ndim == 1:
                # reshape is necessary to preserve data contiguity; `[:, np.newaxis]` would not.
y = np.reshape(y, (-1, 1))
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)
if self._is_sklearn_1():
# sklearn >= 1.0
# TODO: Can instead do `_compute_oob_predictions` but requires post-processing. Skips scoring func.
self.model._set_oob_score_and_attributes(X, y)
else:
# sklearn < 1.0
# TODO: Remove once sklearn < 1.0 support is dropped
self.model._set_oob_score(X, y)
if getattr(self.model, "n_classes_", None) is not None:
if self.model.n_outputs_ == 1:
self.model.n_classes_ = self.model.n_classes_[0]
if getattr(self.model, "oob_decision_function_", None) is not None:
y_oof_pred_proba = self.model.oob_decision_function_
self.model.oob_decision_function_ = None # save memory
elif getattr(self.model, "oob_prediction_", None) is not None:
y_oof_pred_proba = self.model.oob_prediction_
self.model.oob_prediction_ = None # save memory
else:
raise AssertionError(f"Model class {type(self.model)} does not support out-of-fold prediction generation.")
# TODO: Regression does not return NaN for missing rows, instead it sets them to 0. This makes life hard.
# The below code corrects the missing rows to NaN instead of 0.
        # Don't bother if >60 trees: with ~68% of rows in each tree's bootstrap sample, the
        # chance a given row is in-bag for all 60 trees (and thus has no OOB prediction)
        # is ~0.68^60, about 1 in 11 billion.
if self.problem_type == REGRESSION and self.model.n_estimators <= 60:
from sklearn.ensemble._forest import _generate_unsampled_indices, _get_n_samples_bootstrap
n_samples = len(y)
n_predictions = np.zeros(n_samples)
n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.model.max_samples)
for estimator in self.model.estimators_:
unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)
n_predictions[unsampled_indices] += 1
missing_row_mask = n_predictions == 0
y_oof_pred_proba[missing_row_mask] = np.nan
# fill missing prediction rows with average of non-missing rows
if np.isnan(np.sum(y_oof_pred_proba)):
if len(y_oof_pred_proba.shape) == 1:
col_mean = np.nanmean(y_oof_pred_proba)
y_oof_pred_proba[np.isnan(y_oof_pred_proba)] = col_mean
else:
col_mean = np.nanmean(y_oof_pred_proba, axis=0)
inds = np.where(np.isnan(y_oof_pred_proba))
y_oof_pred_proba[inds] = np.take(col_mean, inds[1])
return self._convert_proba_to_unified_form(y_oof_pred_proba)
def _get_default_auxiliary_params(self) -> dict:
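        """Restrict valid raw feature types to bool, int, float, and category."""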
default_auxiliary_params = super()._get_default_auxiliary_params()
extra_auxiliary_params = dict(
valid_raw_types=[R_BOOL, R_INT, R_FLOAT, R_CATEGORY],
)
default_auxiliary_params.update(extra_auxiliary_params)
return default_auxiliary_params
@classmethod
def _get_default_ag_args_ensemble(cls, problem_type=None, **kwargs) -> dict:
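        """Enable `use_child_oof` (efficient OOF via out-of-bag predictions) for all problem
        types except quantile."""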
default_ag_args_ensemble = super()._get_default_ag_args_ensemble(problem_type=problem_type, **kwargs)
if problem_type != QUANTILE: # use_child_oof not supported in quantile regression
extra_ag_args_ensemble = {"use_child_oof": True}
default_ag_args_ensemble.update(extra_ag_args_ensemble)
return default_ag_args_ensemble
def _more_tags(self):
# `can_refit_full=True` because final n_estimators is communicated at end of `_fit`:
# `self.params_trained['n_estimators'] = self.model.n_estimators`
tags = {"can_refit_full": True}
if self.problem_type == QUANTILE:
tags["valid_oof"] = False # not supported in quantile regression
else:
tags["valid_oof"] = True
return tags
def _valid_compilers(self):
return [RFNativeCompiler, RFOnnxCompiler]
def _default_compiler(self):
return RFNativeCompiler
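# ---------------------------------------------------------------------------
# Minimal sketch (illustrative, not part of the original module) of the
# out-of-bag (OOB) mechanism that `_predict_proba_oof` relies on, using plain
# scikit-learn. With `bootstrap=True` and `oob_score=True`, each row is scored
# only by trees whose bootstrap sample excluded it, which is what makes OOB
# predictions usable as leak-free out-of-fold estimates.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
    rf = RandomForestClassifier(n_estimators=100, bootstrap=True, oob_score=True, random_state=0)
    rf.fit(X_demo, y_demo)
    # Shape (n_samples, n_classes). A row left in-bag by every tree would have an
    # all-zero row here; as noted in `_predict_proba_oof`, this is near impossible
    # beyond ~60 trees.
    print(rf.oob_decision_function_[:3])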