"""AutoTabPFNClassifier is an AutoML model wrapper designed to work with the
TabPFN (Tabular Prior-data Fitted Network) for classification
tasks."""
import logging
import numpy as np
import torch
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import check_random_state
from tabpfn import TabPFNClassifier
from autoqild.automl.automl_core import AutomlClassifier
from ..utilities import create_dimensionality_reduction_model
[docs]
class AutoTabPFNClassifier(AutomlClassifier):
"""AutoTabPFNClassifier is an AutoML model wrapper designed to work with
the TabPFN (Tabular Prior-data Fitted Network) for classification
tasks.
This class provides a high-level interface to automatically build, train, and evaluate a
TabPFN model on tabular data. It supports various configurations and allows for dimensionality
reduction if the number of features exceeds a specified threshold. The class is equipped to
handle different feature reduction techniques and can operate on both CPU and GPU, depending on
the available resources.
Parameters
----------
n_features : int
The number of features in the input data.
n_classes : int
The number of classes in the classification task.
n_ensembles : int, default=100
The number of ensemble configurations used by the TabPFN model.
n_reduced : int, default=20
The number of features to reduce to if `n_features` exceeds 50.
reduction_technique : str, optional, default=`select_from_model_rf`
Technique to use for feature reduction, provided by scikit-learn.
Must be one of:
- `recursive_feature_elimination_et`: Uses ExtraTreesClassifier to recursively remove features and build a model.
- `recursive_feature_elimination_rf`: Uses RandomForestClassifier to recursively remove features and build a model.
- `select_from_model_et`: Meta-transformer for selecting features based on importance weights using ExtraTreesClassifier.
- `select_from_model_rf`: Meta-transformer for selecting features based on importance weights using RandomForestClassifier.
- `pca`: Principal Component Analysis for dimensionality reduction.
- `lda`: Linear Discriminant Analysis for separating classes.
- `tsne`: t-Distributed Stochastic Neighbor Embedding for visualization purposes.
- `nmf`: Non-Negative Matrix Factorization for dimensionality reduction.
base_path : str or None, default=None
The path where the trained model and other outputs are saved. If None, no model is saved.
random_state : int or None, default=None
Seed for random number generation to ensure reproducibility.
**kwargs : dict
Additional keyword arguments.
Attributes
----------
n_features : int
The number of features in the input data.
n_classes : int
The number of classes in the classification task.
n_ensembles : int
The number of ensemble configurations used by the TabPFN model.
n_reduced : int
The number of features to reduce to if `n_features` exceeds 50.
reduction_technique : str
The technique used for feature reduction.
base_path : str or None
The path where the trained model and other outputs are saved.
random_state : int or None
Seed for random number generation to ensure reproducibility.
device : str
The device used for computation, either `cpu` or `cuda` depending on the availability of a GPU.
selection_model : object or None
The model used for dimensionality reduction. Initialized during the first call to `transform`.
logger : logging.Logger
Logger object used for logging messages and errors.
model : TabPFNClassifier or None
The TabPFN model object, initialized after fitting.
__is_fitted__ : bool
Flag indicating whether the dimensionality reduction model is fitted.
Private Methods
---------------
__clear_memory__()
Clear memory to release resources by torch.
__transform__(X, y=None):
Transform and reduce the feature matrix with `n_features` features, using the specified reduction
technique to the feature matrix with `n_reduced` features.
"""
def __init__(
    self,
    n_features,
    n_classes,
    n_ensembles=100,
    n_reduced=20,
    reduction_technique="select_from_model_rf",
    base_path=None,
    random_state=None,
    **kwargs,
):
    """Store the configuration and select the computation device.

    No model is built here; `model` and `selection_model` start as None.

    Parameters
    ----------
    n_features : int
        The number of features in the input data.
    n_classes : int
        The number of classes in the classification task.
    n_ensembles : int, default=100
        The number of ensemble configurations handed to the TabPFN model.
    n_reduced : int, default=20
        The number of features to keep after feature reduction.
    reduction_technique : str, default=`select_from_model_rf`
        Name of the feature reduction technique.
    base_path : str or None, default=None
        Path forwarded to the TabPFN model when it is not None.
    random_state : int, RandomState instance or None, default=None
        Seed; converted with `sklearn.utils.check_random_state`, so the stored
        `random_state` attribute is the converted object, not the raw seed.
    **kwargs : dict
        Accepted but not used by this constructor.
    """
    self.n_features = n_features
    self.n_classes = n_classes
    self.logger = logging.getLogger(name=AutoTabPFNClassifier.__name__)
    self.random_state = check_random_state(random_state)
    if n_reduced > n_features:
        # Requesting more output features than exist is not a reduction; the
        # value is still stored unchanged, this only alerts the user.
        self.logger.warning(
            f"Reduced features {n_reduced} are more than actual features {n_features}"
        )
    self.n_reduced = n_reduced
    self.reduction_technique = reduction_technique
    self.selection_model = None
    self.__is_fitted__ = False
    # Prefer the GPU whenever torch reports one.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    self.device = device
    self.logger.info(f"Device {self.device}")
    self.n_ensembles = n_ensembles
    self.model = None
    self.base_path = base_path
[docs]
def fit(self, X, y, **kwd):
    """Fit the TabPFN model to the training data.

    The feature matrix is first passed through `__transform__` together with
    the targets, then a new `TabPFNClassifier` is created and fitted.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Target vector.
    **kwd : dict, optional
        Accepted but not used.

    Returns
    -------
    self : AutoTabPFNClassifier
        The fitted estimator.
    """
    X = self.__transform__(X, y)
    params = dict(
        device=self.device,
        N_ensemble_configurations=self.n_ensembles,
    )
    # Forward base_path only when it was configured: an explicit
    # base_path=None would replace TabPFNClassifier's own default value.
    if self.base_path is not None:
        params["base_path"] = self.base_path
    self.model = TabPFNClassifier(**params)
    self.model.fit(X, y, overwrite_warning=True)
    self.__clear_memory__()
    self.logger.info("Fitting Done")
    return self
[docs]
def predict(self, X, verbose=0):
    """Predict class labels for the input samples.

    The label of each sample is the column index of its largest predicted
    probability, as returned by `predict_proba`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    verbose : int, optional, default=0
        Verbosity level, forwarded to `predict_proba`.

    Returns
    -------
    y_pred : array-like of shape (n_samples,)
        Predicted class labels (argmax over the last axis of the
        probabilities).
    """
    # Forward the caller's verbosity instead of a hard-coded 0.
    proba = self.predict_proba(X, verbose=verbose)
    y_pred = np.argmax(proba, axis=-1)
    self.logger.info("Predict Done")
    return y_pred
[docs]
def score(self, X, y, sample_weight=None, verbose=0):
    """Compute the balanced accuracy score for the input samples.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        True labels.
    sample_weight : array-like of shape (n_samples,), optional
        Sample weights, passed on to `balanced_accuracy_score`.
    verbose : int, optional, default=0
        Verbosity level, forwarded to `predict`.

    Returns
    -------
    acc : float
        Balanced accuracy score of the predictions against `y`.
    """
    y_pred = self.predict(X, verbose=verbose)
    # sample_weight=None is the metric's default, so unweighted calls are unchanged.
    acc = balanced_accuracy_score(y, y_pred, sample_weight=sample_weight)
    return acc
[docs]
def predict_proba(self, X, batch_size=128, verbose=0):
    """Predict class probabilities for the input samples.

    The feature matrix is passed through `__transform__` and then sent to the
    fitted model, either in one call (`batch_size=None`) or in consecutive
    row slices of at most `batch_size` samples whose outputs are concatenated
    along the first axis.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; must expose `.shape`.
    batch_size : int or None, optional, default=128
        Number of samples for which predictions are obtained at one time using
        the learned model. None disables batching.
    verbose : int, optional, default=0
        Verbosity level. Currently unused.

    Returns
    -------
    y_pred : array-like of shape (n_samples, n_classes)
        Predicted class probabilities.

    Raises
    ------
    ValueError
        If `batch_size` is not None and is not positive.
    """
    # Reject unusable batch sizes up front: 0 would otherwise fail with a
    # ZeroDivisionError and negatives with an opaque np.concatenate error.
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size}")
    self.logger.info("Predicting Probabilities")
    n_samples = X.shape[0]
    X = self.__transform__(X)
    if batch_size is None:
        y_pred = self.model.predict_proba(X, normalize_with_test=True, return_logits=False)
    else:
        # Always run at least one batch so that an empty X reaches the model
        # exactly as it does in the unbatched branch.
        n_batches = max(1, int(np.ceil(n_samples / batch_size)))
        predictions = []
        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, n_samples)
            self.logger.info(
                f"Processing batch {i + 1}/{n_batches} Start id {start_idx} end id {end_idx}"
            )
            # NOTE(review): normalize_with_test=True is applied per batch, so
            # results may depend on batch_size - confirm this is intended.
            batch_pred = self.model.predict_proba(
                X[start_idx:end_idx], normalize_with_test=True, return_logits=False
            )
            predictions.append(batch_pred)
        y_pred = np.concatenate(predictions, axis=0)
    self.logger.info("Predicting Probabilities Done")
    self.__clear_memory__()
    return y_pred
[docs]
def decision_function(self, X, verbose=0):
    """Compute the decision function in form of class probabilities for the
    input samples.

    This is a thin alias for `predict_proba` using its default batch size.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    verbose : int, optional, default=0
        Verbosity level, forwarded to `predict_proba`.

    Returns
    -------
    decision : array-like of shape (n_samples, n_classes)
        Class probabilities as returned by `predict_proba`.
    """
    # Pass verbose by keyword: the second positional parameter of
    # predict_proba is batch_size, so a positional verbose=0 would be
    # interpreted as batch_size=0.
    return self.predict_proba(X, verbose=verbose)
[docs]
@staticmethod
def __clear_memory__():
    """Run Python garbage collection and, when CUDA is available, empty
    torch's CUDA cache."""
    from gc import collect as collect_garbage

    collect_garbage()
    cuda = torch.cuda
    if not cuda.is_available():
        # CPU-only environment: there is no CUDA cache to empty.
        return
    cuda.empty_cache()