"""
Code adapted from TabArena: https://github.com/autogluon/tabarena/blob/main/tabarena/tabarena/benchmark/models/ag/tabicl/tabicl_model.py
"""
from __future__ import annotations
import logging
import pandas as pd
from autogluon.common.utils.pandas_utils import get_approximate_df_mem_usage
from autogluon.common.utils.resource_utils import ResourceManager
from autogluon.tabular import __version__
from autogluon.tabular.models.abstract.abstract_torch_model import AbstractTorchModel
logger = logging.getLogger(__name__)
# TODO: Verify whether fitting in parallel crashes when weights are not yet downloaded
class TabICLModel(AbstractTorchModel):
"""
TabICL is a foundation model for tabular data using in-context learning
that is scalable to larger datasets than TabPFNv2. It is pretrained purely on synthetic data.
The default TabICL version used is TabICLv2.
TabICL is one of the top performing methods overall on TabArena-v0.1: https://tabarena.ai
TabICLv2 significantly improves upon TabICLv1, and achieves very strong performance on TabArena.
Paper: TabICL: A Tabular Foundation Model for In-Context Learning on Large Data
Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
Paper: TabICLv2: A better, faster, scalable, and open tabular foundation model
Authors: Jingang Qu, David Holzmüller, Gaël Varoquaux, Marine Le Morvan
Codebase: https://github.com/soda-inria/tabicl
License: BSD-3-Clause
.. versionadded:: 1.4.0
"""
ag_key = "TABICL"
ag_name = "TabICL"
default_classification_model: str | None = "tabicl-classifier-v2-20260212.ckpt"
default_regression_model: str | None = "tabicl-regressor-v2-20260212.ckpt"
ag_priority = 65
seed_name = "random_state"
def get_model_cls(self):
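        """Return the TabICL estimator class for the current problem type:
        ``TabICLClassifier`` for binary/multiclass, ``TabICLRegressor`` otherwise.
        """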
if self.problem_type in ["binary", "multiclass"]:
from tabicl import TabICLClassifier
model_cls = TabICLClassifier
else:
from tabicl import TabICLRegressor
model_cls = TabICLRegressor
return model_cls
@staticmethod
    def _get_batch_size(n_cells: int) -> int:
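        """Heuristic batch size keyed on the total cell count (rows * columns).

        For example, a 100_000-row, 50-column dataset has 5_000_000 cells,
        which falls in the 4M-6M bucket and yields a batch size of 4.
        """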
if n_cells <= 4_000_000:
return 8
elif n_cells <= 6_000_000:
return 4
else:
return 2
def get_checkpoint_version(self, hyperparameter: dict) -> str:
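        """Resolve which checkpoint to load from the ``checkpoint_version`` hyperparameter.

        A single string applies to both problem types, while a ``(clf, reg)`` tuple sets
        them separately; e.g. ``{"checkpoint_version": ("a.ckpt", "b.ckpt")}`` resolves
        to ``"a.ckpt"`` for binary/multiclass and ``"b.ckpt"`` for regression. Falls back
        to the class-level default checkpoints when no override is given.
        """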
clf_checkpoint = self.default_classification_model
reg_checkpoint = self.default_regression_model
        # Resolve a user- or HPO-provided checkpoint override
if "checkpoint_version" in hyperparameter:
if isinstance(hyperparameter["checkpoint_version"], str):
clf_checkpoint = hyperparameter["checkpoint_version"]
reg_checkpoint = hyperparameter["checkpoint_version"]
elif isinstance(hyperparameter["checkpoint_version"], tuple):
clf_checkpoint = hyperparameter["checkpoint_version"][0]
reg_checkpoint = hyperparameter["checkpoint_version"][1]
else:
raise ValueError(
"checkpoint_version hyperparameter must be either a string or a tuple of two strings (clf, reg)."
)
if self.problem_type in ["binary", "multiclass"]:
return clf_checkpoint
return reg_checkpoint
def _fit(
self,
X: pd.DataFrame,
y: pd.Series,
num_cpus: int = 1,
num_gpus: int = 0,
**kwargs,
):
try:
import tabicl
except ImportError as err:
logger.log(
40,
f"\tFailed to import tabicl! To use the TabICL model, "
f"do: `pip install autogluon.tabular[tabicl]=={__version__}`.",
)
raise err
from torch.cuda import is_available
device = "cuda" if num_gpus != 0 else "cpu"
if (device == "cuda") and (not is_available()):
# FIXME: warn instead and switch to CPU.
raise AssertionError(
"Fit specified to use GPU, but CUDA is not available on this machine. "
"Please switch to CPU usage instead.",
)
model_cls = self.get_model_cls()
hyp = self._get_model_params()
hyp["batch_size"] = hyp.get("batch_size", self._get_batch_size(X.shape[0] * X.shape[1]))
self.model = model_cls(
**hyp,
device=device,
n_jobs=num_cpus,
)
X = self.preprocess(X, y=y)
self.model = self.model.fit(
X=X,
y=y,
)
def get_device(self) -> str:
return self.model.device_.type
# TODO: Better to have an official TabICL method for this
def _set_device(self, device: str):
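        """Move the fitted estimator onto ``device``.

        TabICL caches the resolved device in several places (the estimator, the
        underlying torch module, and each inference config), so all of them must
        be updated together.
        """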
device = self.to_torch_device(device)
self.model.device_ = device
self.model.device = self.model.device_.type
self.model.model_ = self.model.model_.to(self.model.device_)
self.model.inference_config_.COL_CONFIG.device = self.model.device_
self.model.inference_config_.ROW_CONFIG.device = self.model.device_
self.model.inference_config_.ICL_CONFIG.device = self.model.device_
def _get_default_auxiliary_params(self) -> dict:
default_auxiliary_params = super()._get_default_auxiliary_params()
default_auxiliary_params.update(
{
# TODO: Instead of caps, should we subsample for large datasets?
"max_rows": 1000000, # TODO: What should be the cap? 1 million rows works, but unsure if it is good
"max_features": 2000, # TODO: What should be the cap? 10k features works, but unsure if it is good
"max_batch_size": 1024, # avoid excessive VRAM usage
}
)
return default_auxiliary_params
@classmethod
def supported_problem_types(cls) -> list[str] | None:
return ["binary", "multiclass", "regression"]
def _get_default_resources(self) -> tuple[int, int]:
# Use only physical cores for better performance based on benchmarks
num_cpus = ResourceManager.get_cpu_count(only_physical_cores=True)
num_gpus = min(1, ResourceManager.get_gpu_count_torch(cuda_only=True))
return num_cpus, num_gpus
def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
hyperparameters = self._get_model_params()
return self.estimate_memory_usage_static(
X=X,
problem_type=self.problem_type,
num_classes=self.num_classes,
hyperparameters=hyperparameters,
**kwargs,
)
@classmethod
def _estimate_memory_usage_static(
cls,
*,
X: pd.DataFrame,
        hyperparameters: dict | None = None,
**kwargs,
) -> int:
"""
Heuristic memory estimate that is very primitive.
Can be vastly improved.
"""
if hyperparameters is None:
hyperparameters = {}
dataset_size_mem_est = 3 * get_approximate_df_mem_usage(X).sum() # roughly 3x DataFrame memory size
baseline_overhead_mem_est = 1e9 # 1 GB generic overhead
n_rows = X.shape[0]
n_features = X.shape[1]
batch_size = hyperparameters.get("batch_size", cls._get_batch_size(X.shape[0] * X.shape[1]))
embedding_dim = 128
bytes_per_float = 4
model_mem_estimate = 2 * batch_size * embedding_dim * bytes_per_float * (4 + n_rows) * n_features
model_mem_estimate *= 1.3 # add 30% buffer
# FIXME: Likely this is overly conservative now, figure out more accurate memory estimate for TabICLv2
# Early testing shows that cutting this in half is safe.
# TODO: Observed memory spikes above expected values on large datasets, increasing mem estimate to compensate
model_mem_estimate *= 2.0 # Note: 1.5 is not large enough, still gets OOM
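        # Worked example with assumed inputs (not from benchmarks): n_rows=10_000,
        # n_features=50, batch_size=8 gives 2 * 8 * 128 * 4 * 10_004 * 50 ≈ 4.1e9
        # bytes, i.e. ≈ 10.7e9 bytes once the 1.3x and 2.0x buffers are applied.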
mem_estimate = model_mem_estimate + dataset_size_mem_est + baseline_overhead_mem_est
        return int(mem_estimate)
@classmethod
def _get_default_ag_args_ensemble(cls, **kwargs) -> dict:
"""
Set fold_fitting_strategy to sequential_local,
as parallel folding crashes if model weights aren't pre-downloaded.
"""
default_ag_args_ensemble = super()._get_default_ag_args_ensemble(**kwargs)
extra_ag_args_ensemble = {
# FIXME: If parallel, uses way more memory, seems to behave incorrectly, so we force sequential.
"fold_fitting_strategy": "sequential_local",
"refit_folds": True, # Better to refit the model for faster inference and similar quality as the bag.
}
default_ag_args_ensemble.update(extra_ag_args_ensemble)
return default_ag_args_ensemble
@classmethod
def _class_tags(cls) -> dict:
return {"can_estimate_memory_usage_static": True}
def _more_tags(self) -> dict:
return {"can_refit_full": True}