Source code for autoqild.bayes_search.bayes_search_cv

"""Implements the main `BayesSearchCV` class, which orchestrates the Bayesian
optimization process extending the functionality of `BayesSearchCV` from the
`scikit-optimize` library."""

import logging

import dill
import numpy as np
from sklearn.utils import check_random_state
from skopt import BayesSearchCV as BayesSearchCVSK
from skopt.utils import eval_callbacks, point_asdict

from ..utilities import log_exception_error

__all__ = ["BayesSearchCV"]



[docs]
class BayesSearchCV(BayesSearchCVSK):
    """BayesSearchCV is a custom implementation of Bayesian optimization-based
    hyperparameter tuning, extending the functionality of `BayesSearchCV` from
    the `scikit-optimize` library. This class facilitates efficient exploration
    of hyperparameter spaces to identify the best-performing model
    configurations.

    This implementation provides additional functionality for logging, handling optimizer states,
    and saving optimization progress to a file, enabling resumption of interrupted searches.

    Parameters
    ----------
    estimator : estimator object
        The object to use to fit the data.

    search_spaces : dict, list of dict or list of tuple
        The search space for the hyperparameters.

    optimizer_kwargs : dict, optional
        Additional arguments for the optimizer.

    n_iter : int, default=50
        Number of parameter settings that are sampled.

    scoring : string, callable or None, default=None
        A single string or a callable to evaluate the predictions on the test set.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    n_points : int, default=1
        Number of parameter settings to sample in parallel.

    iid : boolean, default=True
        If True, return the average score across folds.

    refit : boolean, default=True
        Refit the best estimator with the entire dataset.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.

    verbose : int, default=0
        Controls the verbosity.

    pre_dispatch : int or string, default=`2*n_jobs`
        Controls the number of jobs that get dispatched during parallel execution.

    random_state : int, RandomState instance or None, optional
        Controls the randomness of the estimator.

    error_score : `raise` or numeric, default=`raise`
        Value to assign to the score if an error occurs.

    return_train_score : boolean, default=False
        If False, the results attribute will not include training scores.

    optimizers_file_path : string, default=`results.pkl`
        Path to save the optimizer states.

    Attributes
    ----------
    logger : logging.Logger
        Logger instance used for logging the optimization process and any errors encountered.

    optimizers_file_path : str
        Path to the file where the optimizer states are saved. This allows for resuming optimization
        from where it was left off in case of interruptions.

    Private Methods
    ---------------
    _step(search_space, optimizer, evaluate_candidates, n_points=1)
        Generates parameter combinations and evaluates them in parallel.

    _run_search(evaluate_candidates)
        Runs the search process to find the best hyperparameters by iteratively evaluating different
        configurations based on the Bayesian optimization strategy.

    _load_optimizer_states()
        Reads previously saved optimizers and their results from `optimizers_file_path`.

    _save_optimizer_states()
        Writes the current optimizers and their results to `optimizers_file_path`.
    """

    def __init__(
        self,
        estimator,
        search_spaces,
        optimizer_kwargs=None,
        n_iter=50,
        scoring=None,
        fit_params=None,
        n_jobs=1,
        n_points=1,
        iid=True,
        refit=True,
        cv=None,
        verbose=0,
        pre_dispatch="2*n_jobs",
        random_state=None,
        error_score="raise",
        return_train_score=False,
        optimizers_file_path="results.pkl",
    ):
        # Forward by keyword so that an argument can never silently land in the
        # wrong slot of the parent signature.
        super().__init__(
            estimator=estimator,
            search_spaces=search_spaces,
            optimizer_kwargs=optimizer_kwargs,
            n_iter=n_iter,
            scoring=scoring,
            fit_params=fit_params,
            n_jobs=n_jobs,
            n_points=n_points,
            iid=iid,
            refit=refit,
            cv=cv,
            verbose=verbose,
            pre_dispatch=pre_dispatch,
            random_state=random_state,
            error_score=error_score,
            return_train_score=return_train_score,
        )
        self.optimizers_file_path = optimizers_file_path
        self.logger = logging.getLogger(BayesSearchCV.__name__)

    def _load_optimizer_states(self):
        """Load saved optimizers and their results from `optimizers_file_path`.

        Returns
        -------
        tuple
            `(optimizers, optim_results)` as stored in the file, or `(None, [])`
            when the file is missing or cannot be read; the failure is logged
            and not raised.
        """
        # NOTE(security): dill.load executes arbitrary code embedded in the
        # file; only point `optimizers_file_path` at files you trust.
        try:
            with open(self.optimizers_file_path, "rb") as state_file:
                optimizers, optim_results = dill.load(state_file)
        except FileNotFoundError as error:
            log_exception_error(self.logger, error)
            self.logger.error(f"No such file or directory: {self.optimizers_file_path}")
            return None, []
        except Exception as error:
            # Unreadable or malformed state: fall back to a fresh search.
            log_exception_error(self.logger, error)
            self.logger.error(
                f"Could not load optimizer states from: {self.optimizers_file_path}"
            )
            return None, []
        return optimizers, optim_results

    def _save_optimizer_states(self):
        """Write `(self.optimizers_, self._optim_results)` to `optimizers_file_path`."""
        # The context manager guarantees the file is flushed and closed even if
        # serialization fails part-way.
        with open(self.optimizers_file_path, "wb") as state_file:
            dill.dump((self.optimizers_, self._optim_results), state_file)

    def _step(self, search_space, optimizer, evaluate_candidates, n_points=1):
        """Ask the optimizer for `n_points` candidates, score them and report back.

        Parameters
        ----------
        search_space : dict
            Search space used to map each proposed point to a parameter dict.

        optimizer : object
            Optimizer exposing `ask(n_points=...)` and `tell(points, values)`.

        evaluate_candidates : callable
            Called with the list of parameter dicts; its result is indexed with
            `"mean_test_score"`.

        n_points : int, default=1
            Number of candidates requested from the optimizer.

        Returns
        -------
        object
            Whatever `optimizer.tell` returns for the evaluated points.
        """
        # get parameter values to evaluate
        params = optimizer.ask(n_points=n_points)

        # convert parameters to python native types
        params = [[np.array(v).item() for v in p] for p in params]

        # make lists into dictionaries
        params_dict = [point_asdict(search_space, p) for p in params]
        self.logger.info(f"Parameters values to be tested {params}")
        try:
            all_results = evaluate_candidates(params_dict)
            # the newest scores are the last len(params) entries
            local_results = all_results["mean_test_score"][-len(params) :]
        except Exception as e:
            # Best effort: a failed evaluation is reported to the optimizer as
            # a score of zero for every candidate of this batch.
            local_results = list(np.zeros(len(params)))
            self.logger.info(params_dict)
            log_exception_error(self.logger, e)

        # Feed the point and objective value back into optimizer
        # Optimizer minimizes objective, hence provide negative score
        return optimizer.tell(params, [-score for score in local_results])

    def _run_search(self, evaluate_candidates):
        """Run the optimization for every search space, checkpointing as it goes.

        Optimizer states found in `optimizers_file_path` are reused, and the
        number of iterations already recorded by each optimizer is subtracted
        from the requested budget. The states are written back to the file
        after every step and once more when a search space is finished.

        Parameters
        ----------
        evaluate_candidates : callable
            Callback that evaluates a list of parameter dicts; forwarded to `_step`.
        """
        # check if space is a single dict, convert to list if so
        search_spaces = self.search_spaces
        if isinstance(search_spaces, dict):
            search_spaces = [search_spaces]

        callbacks = self._callbacks

        random_state = check_random_state(self.random_state)
        self.optimizer_kwargs_["random_state"] = random_state

        # Reuse saved optimizers when available, otherwise instantiate
        # optimizers for all the search spaces.
        optimizers, optim_results = self._load_optimizer_states()
        if optimizers is None:
            optimizers = []
            for search_space in search_spaces:
                if isinstance(search_space, tuple):
                    search_space = search_space[0]
                optimizers.append(self._make_optimizer(search_space))
            # 0 is the placeholder kept until a search space yields a result
            optim_results = [0] * len(optimizers)
        self.optimizers_ = optimizers  # will save the states of the optimizers
        self._optim_results = optim_results

        n_points = self.n_points

        for i, (search_space, optimizer) in enumerate(zip(search_spaces, optimizers)):
            # if not provided with search subspace, n_iter is taken as
            # self.n_iter
            if isinstance(search_space, tuple):
                search_space, n_iter = search_space
            else:
                n_iter = self.n_iter
            n_finished = len(optimizer.yi)
            n_iter = n_iter - n_finished
            self.logger.info(
                f"Iterations already done: {n_finished} and running iterations {n_iter}"
            )
            # do the optimization for particular search space
            optim_result = None
            iter_idx = 0
            while n_iter > 0:
                # when n_iter < n_points points left for evaluation
                n_points_adjusted = min(n_iter, n_points)
                # count the points actually requested so the last, smaller
                # batch does not overstate the progress
                iter_idx += n_points_adjusted
                self.logger.info(f"The {iter_idx + n_finished}th parameter values are being tested")
                try:
                    optim_result = self._step(
                        search_space,
                        optimizer,
                        evaluate_candidates,
                        n_points=n_points_adjusted,
                    )
                except Exception as error:
                    log_exception_error(self.logger, error)
                    self.logger.info(f"Cannot evaluate the points {n_points_adjusted}")
                n_iter -= n_points
                # Without any result yet (the step failed) there is nothing to
                # hand to the callbacks and nothing worth storing.
                if optim_result is not None:
                    if eval_callbacks(callbacks, optim_result):
                        break
                    self._optim_results[i] = optim_result
                self._save_optimizer_states()
            # Also covers the early-stop case, where the loop exits before the
            # latest result has been stored.
            if optim_result is not None:
                self._optim_results[i] = optim_result
            self._save_optimizer_states()