Source code for autoqild.automl.tabpfn_classifier

"""AutoTabPFNClassifier is an AutoML model wrapper designed to work with the
TabPFN (Tabular Prior-data Fitted Network) for classification
tasks."""

import logging

import numpy as np
import torch
from sklearn.metrics import balanced_accuracy_score
from sklearn.utils import check_random_state
from tabpfn import TabPFNClassifier

from autoqild.automl.automl_core import AutomlClassifier
from ..utilities import create_dimensionality_reduction_model


class AutoTabPFNClassifier(AutomlClassifier):
    """AutoML wrapper around TabPFN for tabular classification tasks.

    The wrapper builds and trains a ``TabPFNClassifier`` and, when the input
    has more than 50 features and ``n_reduced < n_features``, first reduces
    the feature matrix to ``n_reduced`` columns with the configured
    reduction technique. Computation runs on ``cuda`` when
    ``torch.cuda.is_available()`` and on ``cpu`` otherwise.

    Parameters
    ----------
    n_features : int
        The number of features in the input data.
    n_classes : int
        The number of classes in the classification task.
    n_ensembles : int, default=100
        The number of ensemble configurations used by the TabPFN model
        (forwarded as ``N_ensemble_configurations``).
    n_reduced : int, default=20
        The number of features to reduce to if `n_features` exceeds 50.
    reduction_technique : str, default=`select_from_model_rf`
        Technique to use for feature reduction, provided by scikit-learn.
        Must be one of:

        - `recursive_feature_elimination_et`: Uses ExtraTreesClassifier to
          recursively remove features and build a model.
        - `recursive_feature_elimination_rf`: Uses RandomForestClassifier to
          recursively remove features and build a model.
        - `select_from_model_et`: Meta-transformer for selecting features
          based on importance weights using ExtraTreesClassifier.
        - `select_from_model_rf`: Meta-transformer for selecting features
          based on importance weights using RandomForestClassifier.
        - `pca`: Principal Component Analysis for dimensionality reduction.
        - `lda`: Linear Discriminant Analysis for separating classes.
        - `tsne`: t-Distributed Stochastic Neighbor Embedding for
          visualization purposes.
        - `nmf`: Non-Negative Matrix Factorization for dimensionality
          reduction.
    base_path : str or None, default=None
        Forwarded to ``TabPFNClassifier`` as ``base_path`` when it is not
        None; when None the argument is omitted so TabPFN falls back to its
        own default.
    random_state : int or None, default=None
        Seed for random number generation to ensure reproducibility.
    **kwargs : dict
        Additional keyword arguments. Accepted for interface compatibility
        and currently ignored.

    Attributes
    ----------
    n_features : int
        The number of features in the input data.
    n_classes : int
        The number of classes in the classification task.
    n_ensembles : int
        The number of ensemble configurations used by the TabPFN model.
    n_reduced : int
        The number of features to reduce to if `n_features` exceeds 50.
    reduction_technique : str
        The technique used for feature reduction.
    base_path : str or None
        The path handed to TabPFN (see the parameter description).
    random_state : numpy.random.RandomState
        Random state object produced by ``check_random_state``.
    device : str
        The device used for computation, either `cpu` or `cuda` depending on
        the availability of a GPU.
    selection_model : object or None
        The model used for dimensionality reduction. Created during the
        first call to `__transform__` after construction or after `fit`.
    logger : logging.Logger
        Logger object used for logging messages and errors.
    model : TabPFNClassifier or None
        The TabPFN model object, initialized after fitting.
    __is_fitted__ : bool
        Flag indicating whether the dimensionality reduction model is fitted.

    Private Methods
    ---------------
    __clear_memory__()
        Clear memory to release resources held by torch.
    __transform__(X, y=None)
        Reduce the feature matrix from `n_features` to `n_reduced` features
        using the specified reduction technique.
    """

    # Feature reduction is only applied when the input has more features
    # than this threshold.
    _REDUCTION_THRESHOLD = 50

    def __init__(
        self,
        n_features,
        n_classes,
        n_ensembles=100,
        n_reduced=20,
        reduction_technique="select_from_model_rf",
        base_path=None,
        random_state=None,
        **kwargs,
    ):
        self.n_features = n_features
        self.n_classes = n_classes
        self.logger = logging.getLogger(name=AutoTabPFNClassifier.__name__)
        self.random_state = check_random_state(random_state)
        if n_reduced > n_features:
            # The original message said "less than", contradicting the check.
            self.logger.warning(
                f"Reduced features {n_reduced} are more than actual features {n_features}"
            )
        self.n_reduced = n_reduced
        self.reduction_technique = reduction_technique
        self.selection_model = None
        self.__is_fitted__ = False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.logger.info(f"Device {self.device}")
        self.n_ensembles = n_ensembles
        self.model = None
        self.base_path = base_path

    def __transform__(self, X, y=None):
        """Reduce the feature matrix to `n_reduced` features when required.

        On the first call after construction (or after `fit` resets the
        state) the input is validated against `n_features` / `n_classes`,
        the reduction model is created and, if reduction applies, fitted on
        `X` and `y`. Later calls only apply the already fitted reduction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,), optional
            Target vector.

        Returns
        -------
        X : array-like of shape (n_samples, n_reduced) or (n_samples, n_features)
            Reduced feature matrix, or `X` unchanged when no reduction is
            needed.

        Raises
        ------
        ValueError
            If, on the first call, `X` does not have `n_features` columns or
            `y` does not contain `n_classes` distinct labels.
        """
        self.logger.info(f"Before transform n_instances {X.shape[0]} n_features {X.shape[-1]}")
        classes = None
        if y is not None:
            classes, counts = np.unique(y, return_counts=True)
            self.logger.info(f"Classes {classes} Class counts {counts}")

        reduce_features = (
            self.n_features > self._REDUCTION_THRESHOLD and self.n_reduced < self.n_features
        )
        if not self.__is_fitted__:
            if self.n_features != X.shape[-1]:
                raise ValueError(
                    f"Dataset passed does not contain {self.n_features} features "
                    f"(got {X.shape[-1]})"
                )
            if classes is not None and self.n_classes != len(classes):
                raise ValueError(
                    f"Dataset passed does not contain {self.n_classes} classes "
                    f"(got {len(classes)})"
                )
            self.selection_model = create_dimensionality_reduction_model(
                reduction_technique=self.reduction_technique, n_reduced=self.n_reduced
            )
            self.logger.info("Creating the model")
            if reduce_features:
                self.logger.info(
                    f"Transforming and reducing the {self.n_features} features to {self.n_reduced}"
                )
                self.selection_model.fit(X, y)
                X = self.selection_model.transform(X)
            self.__is_fitted__ = True
        elif reduce_features:
            X = self.selection_model.transform(X)
        self.logger.info(f"After transform n_instances {X.shape[0]} n_features {X.shape[-1]}")
        return X

    def fit(self, X, y, **kwd):
        """Fit the TabPFN model to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Target vector.
        **kwd : dict, optional
            Additional keyword arguments. Currently ignored.

        Returns
        -------
        self : AutoTabPFNClassifier
            The fitted estimator.
        """
        # Every fit starts from scratch: the input is re-validated and the
        # feature reduction is refitted on the new training data instead of
        # silently reusing a selector fitted on an earlier dataset.
        self.__is_fitted__ = False
        self.selection_model = None
        X = self.__transform__(X, y)

        params = dict(
            device=self.device,
            N_ensemble_configurations=self.n_ensembles,
        )
        # Only forward base_path when it is set; passing None would override
        # TabPFN's own default path.
        if self.base_path is not None:
            params["base_path"] = self.base_path
        self.model = TabPFNClassifier(**params)
        self.model.fit(X, y, overwrite_warning=True)
        self.__clear_memory__()
        self.logger.info("Fitting Done")
        return self

    def predict(self, X, verbose=0):
        """Predict class labels for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        verbose : int, optional, default=0
            Verbosity level, forwarded to `predict_proba`.

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels.
        """
        proba = self.predict_proba(X, verbose=verbose)
        y_pred = np.argmax(proba, axis=-1)
        # argmax yields column indices; map them back to the labels seen in
        # training when the fitted model exposes them. For labels 0..k-1 the
        # result is unchanged.
        classes = getattr(self.model, "classes_", None)
        if classes is not None and len(classes) == proba.shape[-1]:
            y_pred = np.asarray(classes)[y_pred]
        self.logger.info("Predict Done")
        return y_pred

    def score(self, X, y, sample_weight=None, verbose=0):
        """Compute the balanced accuracy score for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            True labels.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights, forwarded to `balanced_accuracy_score`.
        verbose : int, optional, default=0
            Verbosity level, forwarded to `predict`.

        Returns
        -------
        acc : float
            Balanced accuracy score.
        """
        y_pred = self.predict(X, verbose=verbose)
        return balanced_accuracy_score(y, y_pred, sample_weight=sample_weight)

    def predict_proba(self, X, batch_size=128, verbose=0):
        """Predict class probabilities for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        batch_size : int or None, optional, default=128
            Number of samples for which predictions are obtained at one time
            using the learned model. If None, all samples are predicted in a
            single call.
        verbose : int, optional, default=0
            Verbosity level. Currently unused.

        Returns
        -------
        y_pred : array-like of shape (n_samples, n_classes)
            Predicted class probabilities.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before `fit`.
        ValueError
            If `batch_size` is not None and not a positive number.
        """
        if self.model is None:
            # Fail before __transform__ runs, otherwise the feature selector
            # would be fitted on prediction data. NotFittedError subclasses
            # both ValueError and AttributeError.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' before using this estimator."
            )
        if batch_size is not None and batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer or None, got {batch_size}")

        self.logger.info("Predicting Probabilities")
        n_samples = X.shape[0]
        X = self.__transform__(X)
        if batch_size is None:
            y_pred = self.model.predict_proba(X, normalize_with_test=True, return_logits=False)
        else:
            batch_starts = range(0, n_samples, batch_size)
            n_batches = len(batch_starts)
            predictions = []
            for i, start_idx in enumerate(batch_starts):
                end_idx = min(start_idx + batch_size, n_samples)
                self.logger.info(
                    f"Processing batch {i + 1}/{n_batches} Start id {start_idx} end id {end_idx}"
                )
                batch_pred = self.model.predict_proba(
                    X[start_idx:end_idx], normalize_with_test=True, return_logits=False
                )
                predictions.append(batch_pred)
            if predictions:
                y_pred = np.concatenate(predictions, axis=0)
            else:
                # No rows to predict: return an empty, correctly shaped result
                # rather than failing inside np.concatenate.
                y_pred = np.empty((0, self.n_classes))
        self.logger.info("Predicting Probabilities Done")
        self.__clear_memory__()
        return y_pred

    def decision_function(self, X, verbose=0):
        """Compute the decision function in form of class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        verbose : int, optional, default=0
            Verbosity level, forwarded to `predict_proba`.

        Returns
        -------
        decision : array-like of shape (n_samples, n_classes)
            Class probabilities as returned by `predict_proba`.
        """
        # verbose must go by keyword: positionally it would be taken as
        # batch_size.
        return self.predict_proba(X, verbose=verbose)

    @staticmethod
    def __clear_memory__():
        """Clear memory to release resources held by torch."""
        import gc

        gc.collect()
        # Explicitly clear the CUDA cache if a GPU is available.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()