# Source code for autoqild.automl.autogluon_classifier

"AutoGluonClassifier is a wrapper for building, training, and evaluating an AutoML model using AutoGluon."
import logging
import os.path
import shutil

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.utils import check_random_state

from autoqild.automl.automl_core import AutomlClassifier
from .model_configurations import hyperparameters, reduced_hyperparameters
from ..utilities.utils import log_exception_error


class AutoGluonClassifier(AutomlClassifier):
    """AutoGluonClassifier is a wrapper for building, training, and evaluating an AutoML model using AutoGluon.

    This class facilitates the use of AutoGluon for automatic machine learning (AutoML) tasks,
    specifically focusing on classification problems. It handles various aspects of model
    training, including hyperparameter tuning, model stacking, and model evaluation. The class
    is designed to work seamlessly with the AutoGluon library, allowing users to leverage its
    powerful features with minimal setup.

    Parameters
    ----------
    n_features : int
        Number of features or dimensionality of the input data.
    n_classes : int
        Number of classes in the classification problem.
    time_limit : int, optional
        Time limit for training the model, in seconds. Default is 1800.
    output_folder : str, optional
        Path to the directory where the trained model and related files will be saved.
        Default is None, in which case the path chosen by AutoGluon is adopted during `fit`.
    eval_metric : str, optional
        Evaluation metric used to assess the performance of the model. Default is `accuracy`.
    use_hyperparameters : bool, optional
        Flag indicating whether to use predefined hyperparameters for model training.
        Default is True.
    delete_tmp_folder_after_terminate : bool, optional
        Flag indicating whether to delete the temporary folder after model training is
        complete. Default is True.
    auto_stack : bool, optional
        Flag indicating whether to use automatic stacking of models in AutoGluon.
        Default is True.
    remove_boosting_models : bool, optional
        Flag indicating whether to exclude boosting models (like GBM, CAT, XGB) from the
        hyperparameters. Default is True.
    verbosity : int, optional
        Level of verbosity for logging and output. Default is 6.
    random_state : int or None, optional
        Seed for random number generation to ensure reproducibility. Default is None.
    **kwargs : dict, optional
        Accepted for interface compatibility; not used by this class.

    Attributes
    ----------
    logger : logging.Logger
        Logger object used for logging messages and errors.
    random_state : np.random.RandomState
        Random state instance for reproducibility.
    output_folder : str
        Path to the directory where the trained model and related files will be saved.
    delete_tmp_folder_after_terminate : bool
        Flag indicating whether to delete the temporary folder after model training is complete.
    hyperparameter_tune_kwargs : dict
        Dictionary containing options for hyperparameter tuning, including the scheduler and
        searcher.
    eval_metric : str
        Evaluation metric used to assess the performance of the model.
    use_hyperparameters : bool
        Flag indicating whether to use predefined hyperparameters for model training.
    verbosity : int
        Level of verbosity for logging and output.
    hyperparameters : dict or None
        Dictionary of hyperparameters used for model training. If `use_hyperparameters` is
        False, this is None.
    exclude_model_types : list
        List of model types to exclude from the training process.
    auto_stack : bool
        Flag indicating whether to use automatic stacking of models in AutoGluon.
    n_features : int
        Number of features or dimensionality of the input data.
    n_classes : int
        Number of classes in the classification problem.
    sample_weight : str
        Method for determining sample weights during training, default is `auto_weight`.
    time_limit : int
        Time limit for training the model, in seconds.
    model : autogluon.tabular.TabularPredictor or None
        The AutoGluon model object, initialized after fitting.
    class_label : str
        Name of the target label column.
    columns : list
        List of column names for the input DataFrame, including feature names and the class
        label.
    problem_type : str or None
        `multiclass` for more than two classes, `binary` for exactly two, otherwise None.
    leaderboard : pandas.DataFrame or None
        DataFrame containing information about the models trained during the fitting process.

    Private Methods
    ---------------
    _is_fitted_() -> bool
        Property to check if the model is already fitted.
    """

    def __init__(
        self,
        n_features,
        n_classes,
        time_limit=1800,
        output_folder=None,
        eval_metric="accuracy",
        use_hyperparameters=True,
        delete_tmp_folder_after_terminate=True,
        auto_stack=True,
        remove_boosting_models=True,
        verbosity=6,
        random_state=None,
        **kwargs,
    ):
        self.logger = logging.getLogger(name=AutoGluonClassifier.__name__)
        self.random_state = check_random_state(random_state)
        self.output_folder = output_folder
        self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
        self.hyperparameter_tune_kwargs = {"scheduler": "local", "searcher": "auto"}
        self.eval_metric = eval_metric
        self.use_hyperparameters = use_hyperparameters
        self.verbosity = verbosity

        # The reduced search space goes with "remove boosting models"; the full one is used
        # when boosting models are allowed. (The branches used to be the other way round.)
        # NOTE(review): assumes `reduced_hyperparameters` is the boosting-free configuration,
        # as its name and the parameter documentation suggest - confirm in model_configurations.
        if not self.use_hyperparameters:
            self.hyperparameters = None
        elif remove_boosting_models:
            self.hyperparameters = reduced_hyperparameters
        else:
            self.hyperparameters = hyperparameters

        if remove_boosting_models:
            self.exclude_model_types = [
                "GBM",
                "CAT",
                "XGB",
                "LGB",
                "KNN",
                "NN_TORCH",
                "AG_AUTOMM",
                "LR",
            ]
        else:
            self.exclude_model_types = ["AG_AUTOMM", "LR"]

        self.auto_stack = auto_stack
        self.n_features = n_features
        self.n_classes = n_classes
        self.sample_weight = "auto_weight"
        self.time_limit = time_limit
        self.model = None
        self.class_label = "class"
        self.columns = [f"feature_{i}" for i in range(self.n_features)] + [self.class_label]

        # Always define `problem_type` so that `fit` never hits an AttributeError; None is
        # handed to TabularPredictor unchanged for degenerate class counts.
        if self.n_classes > 2:
            self.problem_type = "multiclass"
        elif self.n_classes == 2:
            self.problem_type = "binary"
        else:
            self.problem_type = None
        self.leaderboard = None

    @property
    def _is_fitted_(self) -> bool:
        """Check if the model is already fitted.

        Tries to load a saved predictor from `output_folder`, then decides from its
        leaderboard (number of models and time consumed versus `time_limit`) whether the
        previous run can be reused. If it cannot, the saved folder is deleted so that the
        next training run starts from scratch.

        Returns
        -------
        _is_fitted_ : bool
            True if the model is fitted, False otherwise.
        """
        if self.output_folder is None:
            # Nothing on disk can be inspected; only the in-memory predictor counts.
            return self.model is not None

        basename = os.path.basename(self.output_folder)
        leaderboard_loaded = False
        if os.path.exists(self.output_folder):
            try:
                self.model = TabularPredictor.load(self.output_folder)
                self.logger.info(f"Loading the model at {basename}")
                self.leaderboard = self.model.leaderboard(extra_info=True)
                leaderboard_loaded = True
            except Exception as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Cannot load the trained model at {basename}")
                self.model = None

        if self.model is not None:
            if not leaderboard_loaded:
                # Only reached for an in-memory predictor whose folder is gone.
                self.leaderboard = self.model.leaderboard(extra_info=True)
            time_taken = (
                self.leaderboard["fit_time"].sum()
                + self.leaderboard["pred_time_val"].sum()
                + 20
            )
            difference = self.time_limit - time_taken
            # Amount of unused time budget that is tolerated before forcing a retrain.
            if 200 <= self.time_limit < 300:
                limit = 150
            elif self.time_limit >= 3000:
                limit = 2000
            else:
                limit = 200
            self.logger.info(
                f"Fitting time of the model {time_taken} and remaining {difference}, limit {limit}"
            )
            num_models = len(self.leaderboard["fit_time"])
            self.logger.info(f"Number of models trained is {num_models} ")
            if num_models < 1200:
                if num_models <= 50:
                    self.model = None
                    self.logger.info("Retraining the model since they are less than 50")
                if difference >= limit:
                    self.model = None
            else:
                self.logger.info("Enough models trained")

        if self.model is None and os.path.exists(self.output_folder):
            # Remove stale artefacts only when there is something to remove; a missing
            # folder on a first run is expected and not an error.
            try:
                shutil.rmtree(self.output_folder)
                self.logger.error(
                    f"Since the model is not completely fitted, the folder '{basename}' "
                    f"and its contents are deleted successfully."
                )
            except OSError as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Could not delete the folder '{basename}'")
        return self.model is not None

    def fit(self, X, y, **kwd):
        """Fit the AutoGluon model to the training data.

        Training is repeated until `_is_fitted_` accepts the stored predictor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            Target vector.
        **kwd : dict, optional
            Additional keyword arguments (unused).
        """
        self.logger.info("Fitting Started")
        train_data = self.convert_to_dataframe(X, y)
        # NOTE(review): this loop has no retry bound; if training keeps failing the
        # acceptance heuristics in `_is_fitted_`, it never terminates.
        while not self._is_fitted_:
            try:
                self.logger.info("Fitting the model from scratch")
                self.model = TabularPredictor(
                    label=self.class_label,
                    sample_weight=self.sample_weight,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    path=self.output_folder,
                    verbosity=self.verbosity,
                )
                if self.output_folder is None:
                    # Adopt the auto-generated location so the fitted check can find it.
                    self.output_folder = self.model.path
                self.model.fit(
                    train_data,
                    time_limit=self.time_limit,
                    hyperparameters=self.hyperparameters,
                    hyperparameter_tune_kwargs=self.hyperparameter_tune_kwargs,
                    auto_stack=self.auto_stack,
                    excluded_model_types=self.exclude_model_types,
                )
            except Exception as error:
                log_exception_error(self.logger, error)
                self.logger.error("Fit function did not work, checking the saved models")
        self.leaderboard = self.model.leaderboard(extra_info=True)
        if self.delete_tmp_folder_after_terminate:
            self.model.delete_models(models_to_keep="best", dry_run=False)
            self.model.save_space()

    def predict(self, X, verbose=0):
        """Predict class labels for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        verbose : int, optional, default=0
            Verbosity level (unused).

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels.
        """
        test_data = self.convert_to_dataframe(X, None)
        y_pred = self.model.predict(test_data)
        return y_pred.values

    def score(self, X, y, sample_weight=None, verbose=0):
        """Compute the balanced accuracy score for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,)
            True labels.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights (unused).
        verbose : int, optional, default=0
            Verbosity level (unused).

        Returns
        -------
        score : float
            Balanced accuracy score.
        """
        test_data = self.convert_to_dataframe(X, y)
        score = self.model.evaluate(test_data)["balanced_accuracy"]
        return score

    def predict_proba(self, X, verbose=0):
        """Predict class probabilities for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        verbose : int, optional, default=0
            Verbosity level (unused).

        Returns
        -------
        y_pred : array-like of shape (n_samples, n_classes)
            Predicted class probabilities.
        """
        test_data = self.convert_to_dataframe(X, None)
        y_pred = self.model.predict_proba(test_data)
        return y_pred.values

    def decision_function(self, X, verbose=0):
        """Compute the decision function in form of class probabilities for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        verbose : int, optional, default=0
            Verbosity level (unused).

        Returns
        -------
        decision : array-like
            The values returned by `predict_proba` for the same input.
        """
        return self.predict_proba(X, verbose=verbose)

    def convert_to_dataframe(self, X, y=None):
        """Convert the input data to a DataFrame.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.
        y : array-like of shape (n_samples,), optional
            Target vector. When omitted, the label column is filled with values drawn from
            `random_state` so that the frame still has every column in `self.columns`.

        Returns
        -------
        df_data : pandas.DataFrame
            DataFrame containing the input data.

        Raises
        ------
        ValueError
            If `X` is not two-dimensional with `n_features` columns.
        """
        X = np.array(X)  # np.array copies, so the caller's data is never touched
        # Validate before concatenating so malformed input gets this message instead of
        # an opaque NumPy shape error.
        if X.ndim != 2 or X.shape[-1] != self.n_features:
            raise ValueError(f"Dataset passed does not contain {self.n_features} features")
        if y is None:
            y = self.random_state.choice(self.n_classes, size=X.shape[0])
        else:
            y = np.array(y)
        data = np.concatenate((X, y[:, None]), axis=1)
        df_data = pd.DataFrame(data=data, columns=self.columns)
        return df_data

    def get_k_rank_model(self, k):
        """Get the k-th ranked model from the leaderboard.

        Parameters
        ----------
        k : int
            1-based rank of the model to retrieve (1 is the best `score_val`).

        Returns
        -------
        model : object
            The model loaded by the predictor's trainer for the k-th leaderboard entry.

        Raises
        ------
        IndexError
            If `k` is smaller than 1 or larger than the number of leaderboard entries.
        """
        n_models = len(self.leaderboard)
        if not 1 <= k <= n_models:
            # Without this guard k <= 0 wraps around to the other end of the leaderboard.
            raise IndexError(f"Rank k={k} is out of range for {n_models} trained models")
        self.leaderboard.sort_values(["score_val"], ascending=False, inplace=True)
        model_name = self.leaderboard.iloc[k - 1]["model"]
        model = self.model._trainer.load_model(model_name)
        return model

    def get_model(self, model_name):
        """Get a model by its name from the leaderboard.

        Parameters
        ----------
        model_name : str
            Name of the model to retrieve.

        Returns
        -------
        model : object
            The model loaded by the predictor's trainer under `model_name`.
        """
        # Sorting is kept as a side effect: the stored leaderboard is ordered by score.
        self.leaderboard.sort_values(["score_val"], ascending=False, inplace=True)
        model = self.model._trainer.load_model(model_name)
        return model