Source code for autoqild.detectors.random_forest_leakage_detector

"""A leakage detector that utilizes RandomForest models for robust and
interpretable detection."""

from sklearn.ensemble import RandomForestClassifier

from .sklearn_leakage_detector import SklearnLeakageDetector

__all__ = ["RandomForestLeakageDetector"]

from ..utilities import LOG_LOSS_MI_ESTIMATION


class RandomForestLeakageDetector(SklearnLeakageDetector):
    """Leakage detector that uses a Random Forest classifier as its base model.

    This class extends `SklearnLeakageDetector` to detect information leakage
    using a Random Forest classifier as the base model. The Random Forest model
    is well-suited for leakage detection due to its ability to handle complex
    feature interactions and its inherent randomness. This class also supports
    hyperparameter optimization and cross-validation.

    Parameters
    ----------
    padding_name : str
        The name of the padding method used in the experiments to obscure or
        detect leakage.
    learner_params : dict
        Parameters related to the Random Forest model used in the detection
        process. Any ``"n_classes"`` or ``"n_features"`` entries are ignored;
        the dictionary passed in by the caller is left unmodified.
    fit_params : dict
        Parameters passed to the `fit` method during model training.
    hash_value : str
        A unique hash value used to identify and manage result files for a
        specific experiment.
    cv_iterations : int
        The number of cross-validation iterations to perform during model
        evaluation.
    n_hypothesis : int
        The number of hypotheses or models to be tested for leakage.
    base_directory : str
        The base directory where result files, logs, and backups are stored.
    search_space : dict
        The hyperparameter search space for Bayesian optimization.
    hp_iters : int
        The number of iterations for hyperparameter optimization.
    n_inner_folds : int
        The number of folds for inner cross-validation during hyperparameter
        optimization.
    validation_loss : str
        The loss function used to evaluate the performance of models during
        cross-validation.
    random_state : int or RandomState instance, optional
        Controls the randomness for reproducibility, ensuring consistent
        results across different runs.
    **kwargs : dict, optional
        Additional keyword arguments passed to the parent class.
    """

    def __init__(
        self,
        padding_name,
        learner_params,
        fit_params,
        hash_value,
        cv_iterations,
        n_hypothesis,
        base_directory,
        search_space,
        hp_iters,
        n_inner_folds,
        validation_loss,
        random_state=None,
        **kwargs
    ):
        # Drop the dataset-shape entries before handing the parameters to the
        # parent. Build a filtered copy instead of deleting keys in place so
        # that a dictionary shared by the caller (e.g. reused to configure
        # other detectors) is not silently altered.
        # NOTE(review): presumably these keys are removed because
        # RandomForestClassifier's constructor does not accept them -- confirm
        # against how the parent instantiates `base_detector`.
        learner_params = {
            name: value
            for name, value in learner_params.items()
            if name not in ("n_classes", "n_features")
        }
        super().__init__(
            padding_name=padding_name,
            learner_params=learner_params,
            fit_params=fit_params,
            hash_value=hash_value,
            cv_iterations=cv_iterations,
            n_hypothesis=n_hypothesis,
            base_directory=base_directory,
            search_space=search_space,
            hp_iters=hp_iters,
            n_inner_folds=n_inner_folds,
            validation_loss=validation_loss,
            random_state=random_state,
            **kwargs
        )
        # Set after the parent constructor so these values take precedence over
        # anything the parent assigned to the same attributes.
        # NOTE(review): the worker count is hard-coded; how `n_jobs` is consumed
        # is decided by the parent class, which is not visible here.
        self.n_jobs = 8
        # The estimator class itself (not an instance) is stored.
        self.base_detector = RandomForestClassifier

    def hyperparameter_optimization(self, X, y):
        """Perform Bayesian hyperparameter optimization to find the best model parameters.

        This method uses a Bayesian search strategy to explore a predefined
        hyperparameter search space and selects the optimal configuration based
        on the specified validation loss. The method performs cross-validation
        within the search to ensure that the selected hyperparameters
        generalize well. The work is delegated unchanged to the parent class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to be used for training during hyperparameter
            optimization.
        y : array-like of shape (n_samples,)
            The target values (class labels) corresponding to X.

        Returns
        -------
        int
            The size of the training dataset after reduction (if applicable).

        Raises
        ------
        Exception
            If an error occurs during the Bayesian search fitting process.
        """
        return super().hyperparameter_optimization(X, y)

    def fit(self, X, y):
        """Fit the model using cross-validation and hyperparameter optimization.

        This method first checks if the model has already been fitted. If not,
        it runs the hyperparameter optimization process followed by
        cross-validation on the specified number of hypotheses. The model is
        trained using a stratified split of the dataset, and results are
        evaluated using predefined metrics. The work is delegated unchanged to
        the parent class; nothing is returned.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data used for training the models.
        y : array-like of shape (n_samples,)
            The target values (class labels) corresponding to X.

        Notes
        -----
        During fitting, random classifier and majority voting classifier
        performance is also calculated for comparison.
        """
        super().fit(X, y)

    def evaluate_scores(self, X_test, X_train, y_test, y_train, y_pred, p_pred, model, n_model):
        """Evaluate and store model performance metrics for the detection process.

        This method computes various evaluation metrics, such as log-loss,
        accuracy, and confusion matrix, for the model's predictions. It also
        supports probability calibration using techniques like isotonic
        regression and Platt scaling. The results are stored and logged for
        further analysis. The work is delegated unchanged to the parent class;
        nothing is returned.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            The feature matrix for the test set.
        X_train : array-like of shape (n_samples, n_features)
            The feature matrix for the training set.
        y_test : array-like of shape (n_samples,)
            The true target labels for the test data.
        y_train : array-like of shape (n_samples,)
            The true target labels for the training data.
        y_pred : array-like of shape (n_samples,)
            The predicted target labels for the test set.
        p_pred : array-like of shape (n_samples, n_classes)
            The predicted class probabilities for the test data.
        model : object
            The trained model being evaluated.
        n_model : int
            The index of the model in the list of evaluated models.
        """
        super().evaluate_scores(
            X_test=X_test,
            X_train=X_train,
            y_test=y_test,
            y_train=y_train,
            y_pred=y_pred,
            p_pred=p_pred,
            model=model,
            n_model=n_model,
        )

    def detect(self, detection_method="log_loss_mi"):
        """Run the detection process to identify potential information leakage.

        The work is delegated unchanged to the parent class.

        Parameters
        ----------
        detection_method : str
            The method to use for detecting information leakage. Options
            include:

            - `paired-t-test`: Uses paired t-test to compare the accuracy of
              models against the majority voting baseline.
            - `paired-t-test-random`: Uses paired t-test to compare the
              accuracy of models against a random classifier.
            - `fishers-exact-mean`: Applies Fisher's Exact Test on the
              confusion matrix and computes the mean p-value.
            - `fishers-exact-median`: Applies Fisher's Exact Test on the
              confusion matrix and computes the median p-value.
            - `mid_point_mi`: Detects leakage using the midpoint mutual
              information estimation.
            - `log_loss_mi`: Detects leakage using log loss mutual information
              estimation.
            - `log_loss_mi_isotonic_regression`: Uses log loss mutual
              information estimation with isotonic regression calibration.
            - `log_loss_mi_platt_scaling`: Uses log loss mutual information
              estimation with Platt scaling calibration.
            - `log_loss_mi_beta_calibration`: Uses log loss mutual information
              estimation with beta calibration.
            - `log_loss_mi_temperature_scaling`: Uses log loss mutual
              information estimation with temperature scaling.
            - `log_loss_mi_histogram_binning`: Uses log loss mutual
              information estimation with histogram binning.
            - `p_c_softmax_mi`: Uses PC-Softmax mutual information estimation
              for detection.

        Returns
        -------
        detection_decision : bool
            Indicates whether any models showed significant leakage.
        hypothesis_rejected : int
            The number of models flagged for leakage.

        Notes
        -----
        The method implements a Holm-Bonferroni correction to control the
        family-wise error rate for multiple models.
        """
        # NOTE(review): the module imports LOG_LOSS_MI_ESTIMATION but the
        # default above is a string literal; presumably they hold the same
        # value -- confirm before swapping one for the other.
        return super().detect(detection_method=detection_method)