# Source code for autoqild.automl.autogluon_classifier

"AutoGluonClassifier is a wrapper for building, training, and evaluating an AutoML model using AutoGluon."
import logging
import os.path
import shutil

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.utils import check_random_state

from autoqild.automl.automl_core import AutomlClassifier
from .model_configurations import hyperparameters, reduced_hyperparameters
from ..utilities.utils import log_exception_error



[docs]
class AutoGluonClassifier(AutomlClassifier):
    """AutoGluonClassifier is a wrapper for building, training, and evaluating
    an AutoML model using AutoGluon.

    This class facilitates the use of AutoGluon for automatic machine learning (AutoML) tasks,
    specifically focusing on classification problems. It handles various aspects of model training,
    including hyperparameter tuning, model stacking, and model evaluation. The class is designed to
    work seamlessly with the AutoGluon library, allowing users to leverage its powerful features with
    minimal setup.

    Parameters
    ----------
    n_features : int
        Number of features or dimensionality of the input data.
    n_classes : int
        Number of classes in the classification problem.
    time_limit : int, optional
        Time limit for training the model, in seconds. Default is 1800.
    output_folder : str, optional
        Path to the directory where the trained model and related files will be saved. Default is None.
    eval_metric : str, optional
        Evaluation metric used to assess the performance of the model. Default is `accuracy`.
    use_hyperparameters : bool, optional
        Flag indicating whether to use predefined hyperparameters for model training. Default is True.
    delete_tmp_folder_after_terminate : bool, optional
        Flag indicating whether to delete the temporary folder after model training is complete. Default is True.
    auto_stack : bool, optional
        Flag indicating whether to use automatic stacking of models in AutoGluon. Default is True.
    remove_boosting_models : bool, optional
        Flag indicating whether to exclude boosting models (like GBM, CAT, XGB) from the hyperparameters. Default is True.
    verbosity : int, optional
        Level of verbosity for logging and output. Default is 6.
    random_state : int or None, optional
        Seed for random number generation to ensure reproducibility. Default is None.

    Attributes
    ----------
    logger : logging.Logger
        Logger object used for logging messages and errors.
    random_state : np.random.RandomState
        Random state instance for reproducibility.
    output_folder : str
        Path to the directory where the trained model and related files will be saved.
    delete_tmp_folder_after_terminate : bool
        Flag indicating whether to delete the temporary folder after model training is complete.
    hyperparameter_tune_kwargs : dict
        Dictionary containing options for hyperparameter tuning, including the scheduler and searcher.
    eval_metric : str
        Evaluation metric used to assess the performance of the model.
    use_hyperparameters : bool
        Flag indicating whether to use predefined hyperparameters for model training.
    verbosity : int
        Level of verbosity for logging and output.
    hyperparameters : dict or None
        Dictionary of hyperparameters used for model training. If `use_hyperparameters` is False, this is None.
    exclude_model_types : list
        List of model types to exclude from the training process.
    auto_stack : bool
        Flag indicating whether to use automatic stacking of models in AutoGluon.
    n_features : int
        Number of features or dimensionality of the input data.
    n_classes : int
        Number of classes in the classification problem.
    sample_weight : str
        Method for determining sample weights during training, default is `auto_weight`.
    time_limit : int
        Time limit for training the model, in seconds.
    model : autogluon.tabular.TabularPredictor or None
        The AutoGluon model object, initialized after fitting.
    class_label : str
        Name of the target label column.
    columns : list
        List of column names for the input DataFrame, including feature names and the class label.
    leaderboard : pandas.DataFrame or None
        DataFrame containing information about the models trained during the fitting process.

    Private Methods
    ---------------
    _is_fitted_() -> bool
        Property to check if the model is already fitted.
    """

    def __init__(
        self,
        n_features,
        n_classes,
        time_limit=1800,
        output_folder=None,
        eval_metric="accuracy",
        use_hyperparameters=True,
        delete_tmp_folder_after_terminate=True,
        auto_stack=True,
        remove_boosting_models=True,
        verbosity=6,
        random_state=None,
        **kwargs,
    ):
        """Store the configuration later used to build and fit the predictor.

        No model is created here; ``self.model`` and ``self.leaderboard`` start
        as ``None``. Extra keyword arguments are accepted and ignored.

        Parameters
        ----------
        n_features : int
            Number of feature columns; used to build the DataFrame column names.
        n_classes : int
            Number of classes. ``2`` selects the ``binary`` problem type and
            values above ``2`` select ``multiclass``. For any other value the
            problem type is left as ``None``.
        time_limit : int, optional
            Training time limit in seconds.
        output_folder : str or None, optional
            Directory used as the predictor path.
        eval_metric : str, optional
            Evaluation metric name passed to the predictor.
        use_hyperparameters : bool, optional
            If False, ``self.hyperparameters`` is set to ``None``.
        delete_tmp_folder_after_terminate : bool, optional
            If True, ``fit`` keeps only the best model and frees disk space.
        auto_stack : bool, optional
            Forwarded to the predictor's ``fit`` call.
        remove_boosting_models : bool, optional
            Selects the hyperparameter set and the list of excluded model types.
        verbosity : int, optional
            Verbosity forwarded to the predictor.
        random_state : int, RandomState instance or None, optional
            Seed normalised through ``check_random_state``.
        **kwargs : dict
            Ignored.
        """
        self.logger = logging.getLogger(name=AutoGluonClassifier.__name__)
        self.random_state = check_random_state(random_state)
        self.output_folder = output_folder
        self.delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate
        self.hyperparameter_tune_kwargs = {"scheduler": "local", "searcher": "auto"}
        self.eval_metric = eval_metric
        self.use_hyperparameters = use_hyperparameters
        self.verbosity = verbosity

        # NOTE(review): remove_boosting_models=True selects `hyperparameters` and
        # False selects `reduced_hyperparameters`; the names suggest this could be
        # inverted -- confirm against model_configurations before changing it.
        if not self.use_hyperparameters:
            self.hyperparameters = None
        elif remove_boosting_models:
            self.hyperparameters = hyperparameters
        else:
            self.hyperparameters = reduced_hyperparameters

        if remove_boosting_models:
            self.exclude_model_types = [
                "GBM",
                "CAT",
                "XGB",
                "LGB",
                "KNN",
                "NN_TORCH",
                "AG_AUTOMM",
                "LR",
            ]
        else:
            self.exclude_model_types = ["AG_AUTOMM", "LR"]

        self.auto_stack = auto_stack
        self.n_features = n_features
        self.n_classes = n_classes
        self.sample_weight = "auto_weight"
        self.time_limit = time_limit
        self.model = None
        self.class_label = "class"
        self.columns = [f"feature_{i}" for i in range(self.n_features)] + [self.class_label]

        if self.n_classes > 2:
            self.problem_type = "multiclass"
        elif self.n_classes == 2:
            self.problem_type = "binary"
        else:
            # Always define the attribute: leaving it unset made `fit` fail with
            # an AttributeError. None lets the predictor infer the problem type.
            self.problem_type = None
            self.logger.warning(
                f"n_classes={self.n_classes} is smaller than 2; "
                f"the problem type will be inferred from the labels"
            )
        self.leaderboard = None

    @property
    def _is_fitted_(self) -> bool:
        """Check if the model is already fitted.

        Returns
        -------
        _is_fitted_ : bool
            True if the model is fitted, False otherwise.
        """
        basename = os.path.basename(self.output_folder)
        if os.path.exists(self.output_folder):
            try:
                self.model = TabularPredictor.load(self.output_folder)
                self.logger.info(f"Loading the model at {basename}")
                self.leaderboard = self.model.leaderboard(extra_info=True)
            except Exception as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Cannot load the trained model at {basename}")
                self.model = None

        if self.model is not None:
            self.leaderboard = self.model.leaderboard(extra_info=True)
            time_taken = (
                self.leaderboard["fit_time"].sum() + self.leaderboard["pred_time_val"].sum() + 20
            )
            difference = self.time_limit - time_taken
            if 200 <= self.time_limit < 300:
                limit = 150
            elif self.time_limit >= 3000:
                limit = 2000
            else:
                limit = 200
            self.logger.info(
                f"Fitting time of the model {time_taken} and remaining {difference}, limit {limit}"
            )
            num_models = len(self.leaderboard["fit_time"])
            self.logger.info(f"Number of models trained is {num_models} ")
            if num_models < 1200:
                if num_models <= 50:
                    self.model = None
                    self.logger.info(f"Retraining the model since they are less than 50")
                if difference >= limit:
                    self.model = None
            else:
                self.logger.info("Enough models trained")

        if self.model is None:
            try:
                shutil.rmtree(self.output_folder)
                self.logger.error(
                    f"Since the model is not completely fitted, the folder '{basename}' "
                    f"and its contents are deleted successfully."
                )
            except OSError as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Folder does not exist")
        return self.model is not None


[docs]
    def fit(self, X, y, **kwd):
        """Train the AutoGluon predictor on the given data.

        The data is converted to a DataFrame once. Training is then repeated
        until the ``_is_fitted_`` property reports a fitted model; any exception
        raised while building or fitting the predictor is logged and the loop
        tries again. There is no retry limit. Afterwards the leaderboard is
        stored and, if ``delete_tmp_folder_after_terminate`` is set, only the
        best model is kept and disk space is freed.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        **kwd : dict, optional
            Additional keyword arguments (not used).
        """
        self.logger.info("Fitting Started")
        training_frame = self.convert_to_dataframe(X, y)

        while True:
            if self._is_fitted_:
                break
            try:
                self.logger.info("Fitting the model from scratch")
                predictor_options = {
                    "label": self.class_label,
                    "sample_weight": self.sample_weight,
                    "problem_type": self.problem_type,
                    "eval_metric": self.eval_metric,
                    "path": self.output_folder,
                    "verbosity": self.verbosity,
                }
                self.model = TabularPredictor(**predictor_options)
                fit_options = {
                    "time_limit": self.time_limit,
                    "hyperparameters": self.hyperparameters,
                    "hyperparameter_tune_kwargs": self.hyperparameter_tune_kwargs,
                    "auto_stack": self.auto_stack,
                    "excluded_model_types": self.exclude_model_types,
                }
                self.model.fit(training_frame, **fit_options)
            except Exception as error:
                # Best effort: the next _is_fitted_ check decides whether the
                # models saved so far are usable.
                log_exception_error(self.logger, error)
                self.logger.error("Fit function did not work, checking the saved models")

        self.leaderboard = self.model.leaderboard(extra_info=True)
        if not self.delete_tmp_folder_after_terminate:
            return
        self.model.delete_models(models_to_keep="best", dry_run=False)
        self.model.save_space()



[docs]
    def predict(self, X, verbose=0):
        """Return the predicted class label of every sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level (not used).

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels.
        """
        # No labels are available, so convert_to_dataframe fills the label column itself.
        return self.model.predict(self.convert_to_dataframe(X, None)).values



[docs]
    def score(self, X, y, sample_weight=None, verbose=0):
        """Return the balanced accuracy of the model on the given samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            True labels.

        sample_weight : array-like of shape (n_samples,), optional
            Sample weights (accepted but not used).

        verbose : int, optional, default=0
            Verbosity level (not used).

        Returns
        -------
        score : float
            Balanced accuracy score.
        """
        labelled_frame = self.convert_to_dataframe(X, y)
        metrics = self.model.evaluate(labelled_frame)
        return metrics["balanced_accuracy"]



[docs]
    def predict_proba(self, X, verbose=0):
        """Return the predicted class probabilities of every sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level (not used).

        Returns
        -------
        y_pred : array-like of shape (n_samples, n_classes)
            Predicted class probabilities.
        """
        # No labels are available, so convert_to_dataframe fills the label column itself.
        return self.model.predict_proba(self.convert_to_dataframe(X, None)).values



[docs]
    def decision_function(self, X, verbose=0):
        """Compute the decision function in form of class probabilities for the
        input samples.

        This is an alias of ``predict_proba``: the decision values are the
        predicted class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level.

        Returns
        -------
        decision : array-like of shape (n_samples, n_classes)
            Predicted class probabilities used as decision values.
        """
        # Delegate instead of duplicating the conversion and prediction logic.
        return self.predict_proba(X, verbose=verbose)



[docs]
    def convert_to_dataframe(self, X, y=None):
        """Convert the input data to a DataFrame.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,), optional
            Target vector.

        Returns
        -------
        df_data : pandas.DataFrame
            DataFrame containing the input data.
        """
        X = np.asarray(X)
        if y is not None:
            y = np.asarray(y)
        else:
            n_instances = X.shape[0]
            y = self.random_state.choice(self.n_classes, size=n_instances)

        X = np.copy(X)
        X.flags.writeable = True
        y = np.copy(y)
        y.flags.writeable = True

        data = np.concatenate((X, y[:, None]), axis=1)

        if self.n_features != X.shape[-1]:
            raise ValueError(f"Dataset passed does not contain {self.n_features} features")

        df_data = pd.DataFrame(data=data, columns=self.columns)
        return df_data



[docs]
    def get_k_rank_model(self, k):
        """Get the k-th ranked model from the leaderboard.

        Parameters
        ----------
        k : int
            Rank of the model to retrieve.

        Returns
        -------
        model : autogluon.tabular.TabularPredictor
            The k-th ranked model.
        """
        self.leaderboard.sort_values(["score_val"], ascending=False, inplace=True)
        model_name = self.leaderboard.iloc[k - 1]["model"]
        model = self.model._trainer.load_model(model_name)
        return model



[docs]
    def get_model(self, model_name):
        """Load a trained model by name.

        As a side effect the leaderboard is sorted in place by ``score_val``
        in descending order.

        Parameters
        ----------
        model_name : str
            Name of the model to retrieve.

        Returns
        -------
        model : object
            The model object returned by the predictor's trainer for
            ``model_name``.
        """
        self.leaderboard.sort_values(by=["score_val"], ascending=False, inplace=True)
        trainer = self.model._trainer
        return trainer.load_model(model_name)