# Source code for autoqild.classifiers.bayes_predictor

"""A Bayes-optimal classifier leveraging the underlying probability
distribution of the dataset for predictions."""

import logging

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_random_state

from ..bayes_search import get_scores
from ..utilities import normalize


class BayesPredictor(BaseEstimator, ClassifierMixin):
    """A Bayes-optimal classifier that predicts on the given dataset using the
    defined joint and conditional distributions.

    This classifier leverages a dataset object, used to generate underlying
    data, which represents the best-performing classifier. This class stores
    the PDFs and predicts class probabilities and labels given the input
    features.

    Parameters
    ----------
    dataset_obj : object
        An object representing the dataset. The methods of this class read
        its `n_classes`, `class_labels` and `flip_y` attributes and call its
        `generate_dataset`, `get_prob_y_given_x` and
        `get_prob_flip_y_given_x` methods.

    random_state : int or None, optional, default=None
        Random state for reproducibility.

    **kwargs : dict, optional
        Additional keyword arguments. They are accepted for interface
        compatibility and are not used by this class.

    Attributes
    ----------
    dataset_obj : object
        The dataset object provided during initialization. Used for
        generating datasets and computing class probabilities.

    random_state : RandomState
        Random state instance for reproducibility.

    logger : logging.Logger
        Logger instance for logging information.

    n_classes : int or None
        Number of classes in the classification data samples. Set during the
        `fit` method; `None` until then.
    """

    def __init__(self, dataset_obj, random_state=None, **kwargs):
        self.dataset_obj = dataset_obj
        self.random_state = check_random_state(random_state)
        self.logger = logging.getLogger(BayesPredictor.__name__)
        self.n_classes = None

    def fit(self, X, y, **kwd):
        """Fit the BayesPredictor model.

        This method sets the number of classes in the training data but does
        not perform any actual fitting. It is intended to be overridden or
        expanded in a subclass.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples. Not used by this implementation.

        y : array-like of shape (n_samples,)
            The target values (class labels).

        **kwd : dict, optional
            Additional keyword arguments. Not used by this implementation.

        Returns
        -------
        self : BayesPredictor
            The estimator itself, with `n_classes` set to the number of
            distinct values found in `y`.
        """
        self.n_classes = len(np.unique(y))
        return self

    def predict(self, X, verbose=0):
        """Predict class labels for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        verbose : int, optional, default=0
            Verbosity level. Forwarded to `predict_proba`.

        Returns
        -------
        y_pred : array-like of shape (n_samples,)
            Predicted class labels, i.e. the column index of the largest
            probability in each row of `predict_proba(X)`.
        """
        pred_probabilities = self.predict_proba(X=X, verbose=verbose)
        return pred_probabilities.argmax(axis=1)

    def score(self, X, y, sample_weight=None, verbose=0):
        """Compute the accuracy of the predictions.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        y : array-like of shape (n_samples,)
            The true labels.

        sample_weight : array-like of shape (n_samples,), optional
            Sample weights. When given, the weighted fraction of correctly
            classified samples is returned; when `None`, every sample counts
            equally.

        verbose : int, optional, default=0
            Verbosity level. Forwarded to `predict`.

        Returns
        -------
        accuracy_score : float
            The (optionally weighted) accuracy score.
        """
        y_pred = self.predict(X, verbose=verbose)
        is_correct = y_pred == np.asarray(y)
        # np.average with weights=None reduces to the plain mean, so the
        # unweighted result is unchanged; previously sample_weight was
        # accepted but silently ignored.
        return np.average(is_correct, weights=sample_weight)

    def decision_function(self, X, verbose=0):
        """Compute the decision function for the input samples.

        The decision function returns the probability estimates of the
        positive class for binary classification problems.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        verbose : int, optional, default=0
            Verbosity level. Forwarded to `predict_proba`.

        Returns
        -------
        scores : array-like of shape (n_samples,) or (n_samples, n_classes)
            The second probability column when `n_classes` (set by `fit`)
            equals 2; otherwise the full probability matrix.
        """
        scores = self.predict_proba(X, verbose=verbose)
        # n_classes stays None until fit() is called, in which case the full
        # probability matrix is returned even for binary problems.
        if self.n_classes == 2:
            scores = scores[:, 1]
        return scores

    def predict_proba(self, X, verbose=0):
        """Predict class probabilities for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples. Must expose a `shape` attribute.

        verbose : int, optional, default=0
            Verbosity level. Not used by this implementation.

        Returns
        -------
        p_pred : array-like of shape (n_samples, n_classes)
            Predicted class probabilities, where `n_classes` is taken from
            the dataset object.
        """
        dataset = self.dataset_obj
        p_pred = np.zeros((X.shape[0], dataset.n_classes))
        # The choice of conditional distribution does not depend on the
        # class, so pick it once instead of re-testing flip_y per class.
        if dataset.flip_y == 0.0:
            prob_y_given_x = dataset.get_prob_y_given_x
        else:
            prob_y_given_x = dataset.get_prob_flip_y_given_x
        # Class labels double as column indices of the probability matrix.
        for k_class in dataset.class_labels:
            p_pred[:, k_class] = prob_y_given_x(X=X, class_label=k_class)
        # NOTE(review): `normalize` is a project helper; presumably it
        # rescales each row to sum to one -- confirm against its definition.
        return normalize(p_pred, axis=1)

    def get_bayes_predictor_scores(self, n_iterations=100):
        """Generate datasets and evaluate the accuracy of the Bayes predictor.

        This method generates multiple datasets and evaluates the accuracy of
        the Bayes predictor on each one. It returns the true and predicted
        labels along with the prediction probabilities for the dataset that
        achieved the highest accuracy.

        Parameters
        ----------
        n_iterations : int, optional, default=100
            Number of datasets to generate and evaluate. Must be at least 1.

        Returns
        -------
        y_true : array-like of shape (n_samples,)
            The true labels for the dataset with the highest accuracy.

        y_pred : array-like of shape (n_samples,)
            The predicted labels for the dataset with the highest accuracy.

        p_pred : array-like of shape (n_samples, n_classes)
            The predicted probabilities for the dataset with the highest
            accuracy.

        Raises
        ------
        ValueError
            If `n_iterations` is smaller than 1.
        """
        if n_iterations < 1:
            raise ValueError(
                f"n_iterations must be at least 1, got {n_iterations}"
            )
        max_acc = -np.inf
        y_true = None
        y_pred = None
        p_pred = None
        for _ in range(n_iterations):
            X, y = self.dataset_obj.generate_dataset()
            acc_bp = np.mean(self.predict(X) == y)
            # Strict comparison: on ties the earliest best dataset is kept.
            if acc_bp > max_acc:
                self.logger.info("Accuracy of Bayes Predictor is %s", acc_bp)
                max_acc = acc_bp
                y_true = np.copy(y)
                p_pred, y_pred = get_scores(X, self)
        return y_true, y_pred, p_pred