Source code for autoqild.mi_estimators.gmm_mi_estimator

"""Gaussian Mixture Model-based MI estimator for evaluating mutual information
using probabilistic clustering."""

import copy
import logging

import numpy as np
from infoselect import get_gmm, SelectVars
from sklearn.linear_model import LogisticRegression

from autoqild.mi_estimators.mi_base_class import MIEstimatorBase
from ..utilities import create_dimensionality_reduction_model, log_exception_error



[docs]
class GMMMIEstimator(MIEstimatorBase):
    """GMMMIEstimator class for estimating Mutual Information (MI) using
    Gaussian Mixture Models (GMMs) and performing classification using Logistic
    Regression.

    This class leverages GMMs to estimate mutual information and uses feature reduction techniques
    to create a robust classification model. It evaluates different GMMs based on goodness-of-fit measures
    such as AIC, BIC, and log-likelihood.

    Parameters
    ----------
    n_classes : int
        Number of classes in the classification data samples.

    n_features : int
        Number of features or dimensionality of the inputs of the classification data samples.

    y_cat : bool, optional, default=False
        Indicates if the target variable should be considered categorical or real-valued.

    covariance_type : {`full`, `tied`, `diag`, `spherical`}, default=`full`
        String describing the type of covariance parameters to use. Must be one of:

        - `full`: each component has its own general covariance matrix.
        - `tied`: all components share the same general covariance matrix.
        - `diag`: each component has its own diagonal covariance matrix.
        - `spherical`: each component has its own single variance.

    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance. Ensures that the covariance matrices are all positive.

    val_size : float, optional, default=0.30
        Validation set size as a proportion of the dataset to estimate GMMs.

    n_reduced : int, optional, default=20
        Number of features to reduce to in case n_features > 100.

    reduction_technique : str, optional, default=`select_from_model_rf`
        Technique to use for feature reduction, provided by scikit-learn.
        Must be one of:

        - `recursive_feature_elimination_et`: Uses ExtraTreesClassifier to recursively remove features and build a model.
        - `recursive_feature_elimination_rf`: Uses RandomForestClassifier to recursively remove features and build a model.
        - `select_from_model_et`: Meta-transformer for selecting features based on importance weights using ExtraTreesClassifier.
        - `select_from_model_rf`: Meta-transformer for selecting features based on importance weights using RandomForestClassifier.
        - `pca`: Principal Component Analysis for dimensionality reduction.
        - `lda`: Linear Discriminant Analysis for separating classes.
        - `tsne`: t-Distributed Stochastic Neighbor Embedding for visualization purposes.
        - `nmf`: Non-Negative Matrix Factorization for dimensionality reduction.

    random_state : int or object, optional, default=42
        Random state for reproducibility.

    **kwargs : dict, optional
        Additional keyword arguments.

    Attributes
    ----------
    y_cat : bool
        Indicates if the target variable should be considered categorical or real-valued.

    num_comps : list
        List of component counts for GMM evaluation.

    reg_covar : float
        Regularization parameter for the GMM covariance matrices.

    n_models : int
        Number of GMM models to fit and evaluate.

    covariance_type : str
        The covariance type for the GMM.

    val_size : float
        Validation set size as a proportion of the dataset.

    n_reduced : int
        Number of reduced features for dimensionality reduction.

    reduction_technique : str
        Technique used for feature reduction.

    selection_model : object or None
        The fitted feature selection model, or None if not yet fitted.

    __is_fitted__ : bool
        Indicates whether the model is fitted.

    cls_model : LogisticRegression
        The classification model used after feature reduction.

    best_model : object or None
        The best fitted GMM model based on likelihood, or None if no model is selected.

    best_gmm_model : object or None
        The best fitted GMM used for mutual information estimation.

    best_likelihood : float or None
        The highest log-likelihood score achieved during model evaluation.

    best_bic : float or None
        The best Bayesian Information Criterion (BIC) score.

    best_aic : float or None
        The best Akaike Information Criterion (AIC) score.

    best_mi : float or None
        The best estimated mutual information.

    best_seed : int or None
        The random seed used to achieve the best model.

    round : int or None
        The optimal round for feature selection.

    logger : logging.Logger
        Logger instance for logging information.

    Private Methods
    ---------------
    __get_goodnessof_fit__(gmm, X, y):
        Calculate goodness of fit for the GMM model(s) used for MI estimation using Gaussian Mixture Models (GMMs).

    __transform__(X, y=None):
        Transform and reduce the feature matrix with 'n_features' features, using the specified reduction
        technique to the feature matrix with 'n_reduced' features.
    """

    def __init__(
        self,
        n_classes,
        n_features,
        y_cat=False,
        covariance_type="full",
        reg_covar=1e-06,
        val_size=0.30,
        n_reduced=20,
        reduction_technique="select_from_model_rf",
        random_state=42,
        **kwargs,
    ):
        super().__init__(n_classes=n_classes, n_features=n_features, random_state=random_state)
        self.y_cat = y_cat
        self.num_comps = list(np.arange(2, 20, 2))
        self.reg_covar = reg_covar
        self.n_models = 5
        self.covariance_type = covariance_type
        self.val_size = val_size
        if n_reduced > n_features:
            self.logger.warning(
                f"Reduced features {n_reduced} are less than actual features {n_features}"
            )
        self.n_reduced = n_reduced
        self.reduction_technique = reduction_technique
        self.selection_model = None
        self.__is_fitted__ = False

        # Classification Model
        self.cls_model = None
        self.best_model = None
        self.best_gmm_model = None
        self.best_likelihood = None
        self.best_bic = None
        self.best_aic = None
        self.best_mi = None
        self.best_seed = None
        self.round = None
        self.logger = logging.getLogger(GMMMIEstimator.__name__)


[docs]
    def __get_goodnessof_fit__(self, gmm, X, y):
        """Calculate goodness of fit for the GMM model(s) used for estimating
        the Mutual Information (MI) using Gaussian Mixture Models (GMMs).

        Parameters
        ----------
        gmm : GMM or dict
            Gaussian Mixture Model or dictionary of GMMs.

        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        Returns
        -------
        aic_fit : float
            Akaike information criterion for the current model on the input X.

        bic_fit : float
            Bayesian information criterion for the current model on the input X.

        likelihood : float
            Compute the per-sample average log-likelihood of the given data X.

        n_components : int
            Number of components in the GMM.
        """
        if isinstance(gmm, dict):
            classes = list(set(y))
            bic_fit = []
            likelihood = []
            n_components = []
            aic_fit = []
            for c in classes:
                bic_fit.append(gmm[c].bic(X[y == c]))
                aic_fit.append(gmm[c].aic(X[y == c]))
                likelihood.append(gmm[c].score(X[y == c]))
                n_components.append(gmm[c].n_components)
            bic_fit = np.sum(bic_fit)
            aic_fit = np.sum(aic_fit)
            likelihood = np.mean(likelihood)
            n_components = np.mean(n_components)
        else:
            Z = np.hstack((y.reshape((-1, 1)), X))
            bic_fit = gmm.bic(Z)
            aic_fit = gmm.aic(Z)
            likelihood = gmm.score(Z)
            n_components = gmm.n_components
        self.logger.info(f"AIC: {aic_fit}, BIC: {bic_fit}, Likelihood score {likelihood}")
        return aic_fit, bic_fit, likelihood, n_components



[docs]
    def __transform__(self, X, y=None):
        """Transform and reduce the feature matrix with 'n_features' features,
        using the specified reduction technique to the feature matrix with
        'n_reduced' features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,), optional
            Target vector.

        Returns
        -------
        X : array-like of shape (n_samples, n_reduced)
            Transformed feature matrix.
        """
        self.logger.info(f"Before transform n_instances {X.shape[0]} n_features {X.shape[-1]}")
        if y is not None:
            classes, n_classes = np.unique(y, return_counts=True)
            self.logger.info(f"Classes {classes} No of Classes {n_classes}")
        if not self.__is_fitted__:
            if self.n_features != X.shape[-1]:
                raise ValueError(f"Dataset passed does not contain {self.n_features}")
            if y is not None:
                if self.n_classes != len(np.unique(y)):
                    raise ValueError(f"Dataset passed does not contain {self.n_classes}")
            self.selection_model = create_dimensionality_reduction_model(
                reduction_technique=self.reduction_technique, n_reduced=self.n_reduced
            )
            self.logger.info(f"Creating the model")
            if self.n_features > 50 and self.n_reduced < self.n_features:
                self.logger.info(
                    f"Transforming and reducing the {self.n_features} features to {self.n_reduced}"
                )
                self.selection_model.fit(X, y)
                X = self.selection_model.transform(X)
                self.__is_fitted__ = True
        else:
            if self.n_features > 50 and self.n_reduced < self.n_features:
                X = self.selection_model.transform(X)
        self.logger.info(f"After transform n_instances {X.shape[0]} n_features {X.shape[-1]}")
        return X



[docs]
    def fit(self, X, y, verbose=0, **kwd):
        """Fit the GMM model and estimate mutual information.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        verbose : int, optional, default=0
            print or not to print!?.

        **kwd : dict, optional
            Additional keyword arguments.

        Returns
        -------
        self : GMMMIEstimator
            Fitted estimator.
        """
        X = self.__transform__(X, y)
        self.best_likelihood = -np.inf
        seed = self.random_state.randint(2**31, dtype="uint32")
        for iter_ in range(self.n_models):
            # self.logger.info(f"++++++++++++++++++ GMM Model {iter_} ++++++++++++++++++")
            try:
                gmm = get_gmm(
                    X,
                    y,
                    covariance_type=self.covariance_type,
                    y_cat=self.y_cat,
                    num_comps=self.num_comps,
                    reg_covar=self.reg_covar,
                    val_size=self.val_size,
                    random_state=seed + iter_,
                )
                self.logger.info(f"GMM Model {gmm}")
                select = SelectVars(gmm, selection_mode="backward")
                select.fit(X, y, verbose=verbose, eps=np.finfo(np.float32).eps)
                mi_mean, _ = (
                    select.get_info().values[0][1],
                    select.get_info().values[0][2],
                )
                mi = np.max([mi_mean, 0.0]) * np.log2(np.e)
                if not (np.isnan(mi) or np.isinf(mi)):
                    aic, bic, likelihood, n_components = self.__get_goodnessof_fit__(gmm, X, y)
                    # self.logger.info(f"MI {np.around(mi, 4)}  BIC {np.around(bic, 4)} Likelihood "
                    #                 f"{np.around(likelihood, 4)} n_components {n_components}")
                    if self.best_likelihood < likelihood:
                        self.logger.info(
                            f"GMM Model {iter_} set best with likelihood {np.around(likelihood, 4)} "
                            f"AIC {np.around(aic, 4)} BIC {np.around(bic, 4)} MI {np.around(mi, 4)}"
                        )
                        self.best_likelihood = likelihood
                        self.best_bic = bic
                        self.best_aic = aic
                        self.best_mi = mi
                        self.best_model = copy.deepcopy(select)
                        self.best_seed = seed + iter_
                        self.best_gmm_model = get_gmm(
                            X,
                            y,
                            covariance_type=self.covariance_type,
                            y_cat=self.y_cat,
                            num_comps=self.num_comps,
                            reg_covar=self.reg_covar,
                            val_size=self.val_size,
                            random_state=seed + iter_,
                        )
                else:
                    self.logger.info(f"Model {iter_} trained estimates wrong MI")
            except Exception as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Model {iter_} was not valid ")
            # self.logger.info(f"+++++++++++++++++++++++++++++++++++++++++++++++++++++++")

        self.create_classification_model(X, y)
        return self



[docs]
    def create_classification_model(self, X, y, **kwd):
        """Create the logistic regression classification model on reduced
        feature space with n_reduced features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        **kwd : dict, optional
            Additional keyword arguments.
        """
        self.logger.debug(f"Best Model is not None out of {self.n_models} seed {self.best_seed}")
        X = self.__transform__(X, y)
        if self.best_model is not None:
            idx = np.where(self.best_model.get_info()["delta"].values < 0)
            try:
                # self.logger.info(self.best_model.get_info())
                # self.logger.info(f"Indices {idx[0]}")
                self.round = idx[0][0] - 1
            except IndexError as error:
                # log_exception_error(self.logger, error)
                self.round = 0
            X_new = self.best_model.transform(X, rd=self.round)
            self.cls_model = LogisticRegression()
            self.cls_model.fit(X_new, y)
        else:
            self.cls_model = LogisticRegression()
            self.cls_model.fit(X, y)



[docs]
    def predict(self, X, verbose=0):
        """Predict class labels for the input samples with reduced features of
        n_reduced using the fitted logistic regression classification model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level.

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels.
        """
        X = self.__transform__(X)
        if self.best_model is not None:
            X = self.best_model.transform(X, rd=self.round)
        return self.cls_model.predict(X=X)



[docs]
    def score(self, X, y, sample_weight=None, verbose=0):
        """Compute the likelihood score of the GMM model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        sample_weight : array-like of shape (n_samples,), optional
            Sample weights.

        verbose : int, optional, default=0
            Verbosity level.

        Returns
        -------
        score : float
            The score of the model based on likelihood.
        """
        X = self.__transform__(X, y)
        try:
            aic, bic, likelihood, n_components = self.__get_goodnessof_fit__(
                self.best_model.gmm, X, y
            )
            mi_mean, _ = (
                self.best_model.get_info().values[0][1],
                self.best_model.get_info().values[0][2],
            )
            mi = np.max([mi_mean, 0.0]) * np.log2(np.e)
            self.logger.info(
                f"MI {np.around(mi, 4)}  AIC {np.around(aic, 4)} BIC {np.around(bic, 4)} "
                f"Likelihood {np.around(likelihood, 4)} n_components {n_components}"
            )
            score = likelihood
            self.logger.debug(f"Best Model is not None out of {self.n_models} score {score}")
        except Exception as error:
            self.logger.debug("Best Model is None")
            log_exception_error(self.logger, error)
            score = -1000000
        return score



[docs]
    def predict_proba(self, X, verbose=0):
        """Predict class labels for the input samples with reduced features of
        n_reduced using the fitted logistic regression classification model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level.

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels.
        """
        X = self.__transform__(X)
        if self.best_model is not None:
            X = self.best_model.transform(X, rd=self.round)
        y_pred = self.cls_model.predict_proba(X=X)
        return y_pred



[docs]
    def decision_function(self, X, verbose=0):
        """Predict confidence scores for samples, which is proportional to the
        signed distance of that sample to the hyperplane.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        verbose : int, optional, default=0
            Verbosity level.

        Returns
        -------
        decision : array-like of shape (n_samples,)
            Decision function values.
        """
        X = self.__transform__(X)
        if self.best_model is not None:
            X = self.best_model.transform(X, rd=self.round)
        return self.cls_model.decision_function(X=X)



[docs]
    def estimate_mi(self, X, y, verbose=0, **kwd):
        """Estimate mutual information using the best fitted GMM model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix.

        y : array-like of shape (n_samples,)
            Target vector.

        verbose : int, optional, default=0
            Verbosity level.

        **kwd : dict, optional
            Additional keyword arguments.

        Returns
        -------
        mi_estimated : float
            Estimated mutual information.
        """
        X = self.__transform__(X, y)
        iter_ = 0
        while True:
            try:
                iter_ += 1
                select = SelectVars(self.best_gmm_model, selection_mode="backward")
                select.fit(X, y, verbose=verbose, eps=np.finfo(np.float32).eps)
                mi_mean, _ = (
                    select.get_info().values[0][1],
                    select.get_info().values[0][2],
                )
                mi_estimated = np.nanmax([mi_mean, 0.0]) * np.log2(np.e)
                if verbose:
                    print(f"Model Number: {iter_}, Estimated MI: {mi_estimated}")
                self.logger.info(f"Model Number: {iter_}, Estimated MI: {mi_estimated}")
            except Exception as error:
                log_exception_error(self.logger, error)
                self.logger.error(f"Model {iter_} was not valid re-estimating it")
                mi_estimated = np.nan
            if np.isnan(mi_estimated) or np.isinf(mi_estimated):
                self.logger.error(f"Nan MI Re-estimating")
            else:
                break
            if iter_ > 100:
                if np.isnan(mi_estimated) or np.isinf(mi_estimated):
                    self.logger.error(f"Setting Mi to 0")
                    mi_estimated = 0.0
                break
        return mi_estimated