Source code for autoqild.bayes_search.bayes_search_cv

"""Implements the main `BayesSearchCV` class, which orchestrates the Bayesian
optimization process extending the functionality of `BayesSearchCV` from the
`scikit-optimize` library."""

import logging

import dill
import numpy as np
from sklearn.utils import check_random_state
from skopt import BayesSearchCV as BayesSearchCVSK
from skopt.utils import eval_callbacks, point_asdict

from ..utilities import log_exception_error

__all__ = ["BayesSearchCV"]


[docs] class BayesSearchCV(BayesSearchCVSK): """BayesSearchCV is a custom implementation of Bayesian optimization-based hyperparameter tuning, extending the functionality of `BayesSearchCV` from the `scikit-optimize` library. This class facilitates efficient exploration of hyperparameter spaces to identify the best-performing model configurations. This implementation provides additional functionality for logging, handling optimizer states, and saving optimization progress to a file, enabling resumption of interrupted searches. Attributes ---------- logger : logging.Logger Logger instance used for logging the optimization process and any errors encountered. optimizers_file_path : str Path to the file where the optimizer states are saved. This allows for resuming optimization from where it was left off in case of interruptions. Parameters ---------- estimator : estimator object The object to use to fit the data. search_spaces : dict, list of dict or list of tuple The search space for the hyperparameters. optimizer_kwargs : dict, optional Additional arguments for the optimizer. n_iter : int, default=50 Number of parameter settings that are sampled. scoring : string, callable or None, default=None A single string or a callable to evaluate the predictions on the test set. fit_params : dict, optional Parameters to pass to the fit method of the estimator. n_jobs : int, default=1 Number of jobs to run in parallel. n_points : int, default=1 Number of parameter settings to sample in parallel. iid : boolean, default=True If True, return the average score across folds. refit : boolean, default=True Refit the best estimator with the entire dataset. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. verbose : int, default=0 Controls the verbosity. pre_dispatch : int or string, default=`2*n_jobs` Controls the number of jobs that get dispatched during parallel execution. random_state : int, RandomState instance or None, optional Controls the randomness of the estimator. error_score : `raise` or numeric, default=`raise` Value to assign to the score if an error occurs. return_train_score : boolean, default=False If False, the results attribute will not include training scores. optimizers_file_path : string, default=`results.pkl` Path to save the optimizer states. Private Methods --------------- _step(search_space, optimizer, evaluate_candidates, n_points=1) Generates parameter combinations and evaluates them in parallel. _run_search(evaluate_candidates) Runs the search process to find the best hyperparameters by iteratively evaluating different configurations based on the Bayesian optimization strategy. """ def __init__(self, estimator, search_spaces, optimizer_kwargs=None, n_iter=50, scoring=None, fit_params=None, n_jobs=1, n_points=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch="2*n_jobs", random_state=None, error_score="raise", return_train_score=False, optimizers_file_path="results.pkl", ): super().__init__( estimator, search_spaces, optimizer_kwargs, n_iter, scoring, fit_params, n_jobs, n_points, iid, refit, cv, verbose, pre_dispatch, random_state, error_score, return_train_score, ) self.optimizers_file_path = optimizers_file_path self.logger = logging.getLogger(BayesSearchCV.__name__) def _step(self, search_space, optimizer, evaluate_candidates, n_points=1): # get parameter values to evaluate params = optimizer.ask(n_points=n_points) # convert parameters to python native types params = [[np.array(v).item() for v in p] for p in params] # make lists into dictionaries params_dict = [point_asdict(search_space, p) for p in params] self.logger.info(f"Parameters values to be tested {params}") try: all_results = evaluate_candidates(params_dict) local_results = all_results["mean_test_score"][-len(params) :] except Exception as e: local_results = list(np.zeros(len(params))) self.logger.info(params_dict) log_exception_error(self.logger, e) # Feed the point and objective value back into optimizer # Optimizer minimizes objective, hence provide negative score return optimizer.tell(params, [-score for score in local_results]) def _run_search(self, evaluate_candidates): # check if space is a single dict, convert to list if so search_spaces = self.search_spaces if isinstance(search_spaces, dict): search_spaces = [search_spaces] callbacks = self._callbacks random_state = check_random_state(self.random_state) self.optimizer_kwargs_["random_state"] = random_state # Instantiate optimizers for all the search spaces. try: optimizers, optim_results = dill.load(open(self.optimizers_file_path, "rb")) except Exception as error: log_exception_error(self.logger, error) self.logger.error(f"No such file or directory: {self.optimizers_file_path}") optimizers = None optim_results = [] if optimizers is None: optimizers = [] for search_space in search_spaces: if isinstance(search_space, tuple): search_space = search_space[0] optimizers.append(self._make_optimizer(search_space)) self.optimizers_ = optimizers # will save the states of the optimizers self._optim_results = [0 for o in optimizers] else: self._optim_results = optim_results self.optimizers_ = optimizers n_points = self.n_points for i, (search_space, optimizer) in enumerate(zip(search_spaces, optimizers)): # if not provided with search subspace, n_iter is taken as # self.n_iter if isinstance(search_space, tuple): search_space, n_iter = search_space else: n_iter = self.n_iter n_finished = len(optimizer.yi) n_iter = n_iter - n_finished self.logger.info( f"Iterations already done: {n_finished} and running iterations {n_iter}" ) # do the optimization for particular search space optim_result = None iter_idx = 0 while n_iter > 0: # when n_iter < n_points points left for evaluation n_points_adjusted = min(n_iter, n_points) iter_idx += n_points self.logger.info(f"The {iter_idx + n_finished}th parameter values are being tested") try: optim_result = self._step( search_space, optimizer, evaluate_candidates, n_points=n_points_adjusted, ) except Exception as error: log_exception_error(self.logger, error) self.logger.info(f"Cannot evaluate the points {n_points_adjusted}") n_iter -= n_points if eval_callbacks(callbacks, optim_result): break self._optim_results[i] = optim_result dill.dump( (self.optimizers_, self._optim_results), open(self.optimizers_file_path, "wb"), ) if optim_result is not None: self._optim_results[i] = optim_result dill.dump( (self.optimizers_, self._optim_results), open(self.optimizers_file_path, "wb"), )