Source code for autogluon.tabular.models.xgboost.xgboost_model

import os
import time
import logging

import psutil

from autogluon.core.constants import MULTICLASS, REGRESSION, SOFTCLASS, PROBLEM_TYPES_CLASSIFICATION
from autogluon.core.features.types import R_OBJECT
from autogluon.core.models import AbstractModel
from autogluon.core.models._utils import get_early_stopping_rounds
from autogluon.core.utils import try_import_xgboost
from autogluon.core.utils.exceptions import NotEnoughMemoryError

from . import xgboost_utils
from .hyperparameters.parameters import get_param_baseline
from .hyperparameters.searchspaces import get_default_searchspace

logger = logging.getLogger(__name__)


class XGBoostModel(AbstractModel):
    """
    XGBoost model: https://xgboost.readthedocs.io/en/latest/

    Hyperparameter options: https://xgboost.readthedocs.io/en/latest/parameter.html
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._ohe_generator = None

    def _set_default_params(self):
        default_params = get_param_baseline(problem_type=self.problem_type, num_classes=self.num_classes)
        for param, val in default_params.items():
            self._set_default_param_value(param, val)

    def _get_default_searchspace(self):
        return get_default_searchspace(problem_type=self.problem_type, num_classes=self.num_classes)

    def _get_default_auxiliary_params(self) -> dict:
        default_auxiliary_params = super()._get_default_auxiliary_params()
        extra_auxiliary_params = dict(
            ignored_type_group_raw=[R_OBJECT],
        )
        default_auxiliary_params.update(extra_auxiliary_params)
        return default_auxiliary_params

    # Use specialized XGBoost metric if available (fast), otherwise use custom func generator
    def get_eval_metric(self):
        eval_metric = xgboost_utils.convert_ag_metric_to_xgbm(ag_metric_name=self.stopping_metric.name, problem_type=self.problem_type)
        if eval_metric is None:
            eval_metric = xgboost_utils.func_generator(metric=self.stopping_metric, is_higher_better=True, needs_pred_proba=not self.stopping_metric.needs_pred, problem_type=self.problem_type)
        return eval_metric

    def _preprocess(self, X, is_train=False, max_category_levels=None, **kwargs):
        X = super()._preprocess(X=X, **kwargs)

        if self._ohe_generator is None:
            self._ohe_generator = xgboost_utils.OheFeatureGenerator(max_levels=max_category_levels)

        if is_train:
            self._ohe_generator.fit(X)

        X = self._ohe_generator.transform(X)

        return X

    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_gpus=0,
             sample_weight=None,
             sample_weight_val=None,
             verbosity=2,
             **kwargs):
        # TODO: utilize sample_weight_val in early-stopping if provided
        start_time = time.time()
        ag_params = self._get_ag_params()
        params = self._get_model_params()
        max_category_levels = params.pop('proc.max_category_levels', 100)

        if verbosity <= 2:
            verbose = False
            verbose_eval = None
        elif verbosity == 3:
            verbose = True
            verbose_eval = 50
        else:
            verbose = True
            verbose_eval = 1

        self._assert_memory_safe(X=X, y=y)
        X = self.preprocess(X, is_train=True, max_category_levels=max_category_levels)
        num_rows_train = X.shape[0]

        eval_set = []
        eval_metric = self.get_eval_metric()

        if X_val is None:
            early_stopping_rounds = None
            eval_set.append((X, y))  # TODO: if the train dataset is large, use sample of train dataset for validation
        else:
            X_val = self.preprocess(X_val, is_train=False)
            eval_set.append((X_val, y_val))
            early_stopping_rounds = ag_params.get('ag.early_stop', 'auto')
            if isinstance(early_stopping_rounds, str):
                early_stopping_rounds = self._get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=early_stopping_rounds)

        if num_gpus != 0:
            params['tree_method'] = 'gpu_hist'
            if 'gpu_id' not in params:
                params['gpu_id'] = 0
        elif 'tree_method' not in params:
            params['tree_method'] = 'hist'

        try_import_xgboost()
        from .callbacks import EarlyStoppingCustom
        from xgboost.callback import EvaluationMonitor
        callbacks = []
        if verbose_eval is not None:
            callbacks.append(EvaluationMonitor(period=verbose_eval))
        # TODO: disable early stopping during refit_full
        callbacks.append(EarlyStoppingCustom(early_stopping_rounds, start_time=start_time, time_limit=time_limit, verbose=verbose))

        from xgboost import XGBClassifier, XGBRegressor
        model_type = XGBClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else XGBRegressor
        self.model = model_type(**params)
        self.model.fit(
            X=X,
            y=y,
            eval_set=eval_set,
            eval_metric=eval_metric,
            verbose=False,
            callbacks=callbacks,
            sample_weight=sample_weight
        )

        bst = self.model.get_booster()
        # TODO: Investigate speed-ups from GPU inference
        # bst.set_param({"predictor": "gpu_predictor"})
        self.params_trained['n_estimators'] = bst.best_ntree_limit

    def _predict_proba(self, X, **kwargs):
        X = self.preprocess(X, **kwargs)

        if self.problem_type == REGRESSION:
            return self.model.predict(X)

        y_pred_proba = self.model.predict_proba(X)
        return self._convert_proba_to_unified_form(y_pred_proba)

    def _get_early_stopping_rounds(self, num_rows_train, strategy='auto'):
        return get_early_stopping_rounds(num_rows_train=num_rows_train, strategy=strategy)

    def _ag_params(self) -> set:
        return {'ag.early_stop'}

    # FIXME: This is copy-pasted from CatBoostModel, make a generic memory check method / function and re-use.
    def _assert_memory_safe(self, X, y):
        num_rows_train = X.shape[0]
        num_cols_train = X.shape[1]
        if self.problem_type == MULTICLASS:
            if self.num_classes is not None:
                num_classes = self.num_classes
            else:
                num_classes = 10  # Guess if not given, can do better by looking at y
        elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
            num_classes = y.shape[1]
        else:
            num_classes = 1
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
        if approx_mem_size_req > 1e9:  # > 1 GB
            available_mem = psutil.virtual_memory().available
            ratio = approx_mem_size_req / available_mem
            if ratio > (1 * max_memory_usage_ratio):
                logger.warning('\tWarning: Not enough memory to safely train XGBoost model, roughly requires: %s GB, but only %s GB is available...' % (round(approx_mem_size_req / 1e9, 3), round(available_mem / 1e9, 3)))
                raise NotEnoughMemoryError
            elif ratio > (0.2 * max_memory_usage_ratio):
                logger.warning('\tWarning: Potentially not enough memory to safely train XGBoost model, roughly requires: %s GB, but only %s GB is available...' % (round(approx_mem_size_req / 1e9, 3), round(available_mem / 1e9, 3)))
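
Usage note: XGBoostModel is normally constructed and fit by AutoGluon's trainer via TabularPredictor rather than used directly. The sketch below shows one way the model might be driven on its own, assuming the AbstractModel constructor arguments (path, name, problem_type, eval_metric, hyperparameters) of this AutoGluon version; the column names, directory, and hyperparameter values are illustrative only, not a supported API.

# Minimal sketch, assuming the AbstractModel interface shown in this module's version of AutoGluon.
# All names below (path, model name, column names, hyperparameter values) are illustrative assumptions.
import pandas as pd

from autogluon.tabular.models.xgboost.xgboost_model import XGBoostModel

# Toy training data: one numeric column and one categorical column.
# Category-typed columns are one-hot encoded by OheFeatureGenerator during _preprocess;
# raw object-typed columns would be dropped per _get_default_auxiliary_params (R_OBJECT is ignored).
train_data = pd.DataFrame({
    'feature_num': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
    'feature_cat': pd.Series(['a', 'b', 'a', 'b', 'a', 'b']).astype('category'),
})
labels = pd.Series([0, 1, 0, 1, 0, 1])

model = XGBoostModel(
    path='models/',          # directory where the model would be saved (assumption)
    name='XGBoostCustom',    # arbitrary model name (assumption)
    problem_type='binary',   # one of AutoGluon's problem type constants
    eval_metric='accuracy',  # AutoGluon metric name
    hyperparameters={'n_estimators': 100, 'learning_rate': 0.1},
)

# fit() on AbstractModel forwards these arguments to the _fit() defined above.
model.fit(X=train_data, y=labels, time_limit=60)
pred_proba = model.predict_proba(train_data)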