Source code for autoqild.utilities.utils

"""Implements several utility functions for array normalization, logging
exceptions, managing HDF5 files, and creating directories safely."""

import os
import sys
import traceback
import h5py
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import warnings

# Install a blanket "ignore" filter as a side effect of importing this module.
# NOTE(review): the filter is process-wide, so it also hides warnings raised by
# unrelated code — confirm this is intended.
warnings.filterwarnings("ignore")

# Public API of the module: the names exported by a star-import.
# NOTE(review): "print_dictionary" is listed here, but no function with that
# name is defined in the visible part of this file. A name in `__all__` that
# does not exist makes `from ... import *` raise AttributeError — confirm it is
# defined elsewhere or remove the entry.
__all__ = [
    "logsumexp",
    "softmax",
    "sigmoid",
    "normalize",
    "progress_bar",
    "print_dictionary",
    "standardize_features",
    "create_directory_safely",
    "log_exception_error",
    "check_and_delete_corrupt_h5_file",
]


def logsumexp(x, axis=1):
    """Compute the log of the sum of exponentials of input elements.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large inputs do not overflow.

    Parameters
    ----------
    x : array-like
        Input array (lists and other array-likes are converted with
        ``np.asarray``).

    axis : int, optional
        Axis along which the sum is computed. Default is 1.

    Returns
    -------
    lsum_exp : ndarray
        The log of the sum of exponentials of elements along the specified
        axis. The reduced axis is kept with length 1.
    """
    x = np.asarray(x)
    max_x = x.max(axis=axis, keepdims=True)
    # An infinite maximum would turn ``x - max_x`` into ``inf - inf = nan``.
    # Shifting by 0 in that case lets the infinity propagate to the result
    # (-inf for an all -inf slice, +inf when a +inf is present).
    shift = np.where(np.isfinite(max_x), max_x, 0)
    # log(0) = -inf is the correct answer for an all -inf slice; silence the
    # accompanying divide-by-zero warning.
    with np.errstate(divide="ignore"):
        lsum_exp = shift + np.log(np.sum(np.exp(x - shift), axis=axis, keepdims=True))
    return lsum_exp
def softmax(x, axis=1):
    """Apply the softmax function along one axis of the input.

    The normalisation is carried out in log-space: the log-sum-exp of ``x``
    along ``axis`` is subtracted from ``x`` before exponentiating.

    Parameters
    ----------
    x : array-like
        Input array.

    axis : int, optional
        Axis along which the softmax is computed. Default is 1.

    Returns
    -------
    s_values : array-like
        An array with the softmax applied along the specified axis.
    """
    return np.exp(x - logsumexp(x, axis=axis))
def sigmoid(x):
    """Compute the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : array-like
        Input values (scalars, lists and other array-likes are converted
        with ``np.asarray``).

    Returns
    -------
    result : ndarray or numpy scalar
        The sigmoid of the input, with the same shape as ``x``.
    """
    x = np.asarray(x)
    # For very negative inputs ``exp(-x)`` overflows to inf, and 1 / (1 + inf)
    # evaluates to 0.0 — the correct limit — so the overflow warning is noise.
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))
def normalize(x, axis=1):
    """Scale the input so that its entries sum to one along ``axis``.

    Parameters
    ----------
    x : array-like
        Input array to normalize.

    axis : int, optional
        Axis along which the normalization is applied. Default is 1.

    Returns
    -------
    normed : array-like
        The input divided by its sum along ``axis``.
    """
    # keepdims=True keeps the reduced axis so the totals broadcast against x.
    totals = np.sum(x, axis=axis, keepdims=True)
    return x / totals
def progress_bar(count, total, status=""):
    """Display a progress bar in the console.

    The bar is written to ``sys.stdout`` followed by a carriage return, so
    successive calls overwrite the same console line.

    Parameters
    ----------
    count : int
        Current progress count.

    total : int
        Total count for completion. A non-positive total has no meaningful
        ratio and is drawn as an empty bar.

    status : str, optional
        A status message to display along with the progress bar.
    """
    bar_len = 60
    if total > 0:
        filled_len = int(round(bar_len * count / float(total)))
        # Keep the bar exactly ``bar_len`` wide when count is negative or
        # exceeds total.
        filled_len = min(max(filled_len, 0), bar_len)
    else:
        # Avoid ZeroDivisionError (total == 0) and nonsensical negative ratios.
        filled_len = 0

    bar = "=" * filled_len + "-" * (bar_len - filled_len)
    sys.stdout.write("[%s] %s/%s ...%s\r" % (bar, count, total, status))
    sys.stdout.flush()
def log_exception_error(logger, e):
    """Log an exception with traceback details.

    Two error records are emitted: first the formatted traceback of ``e``,
    then its message.

    Parameters
    ----------
    logger : logging.Logger
        Logger instance to log the error.

    e : Exception
        Exception instance to log.
    """
    # Format the traceback carried by ``e`` itself. ``traceback.format_exc()``
    # reports whatever exception is currently being handled, which is
    # "NoneType: None" when this helper is called outside an ``except`` block.
    trace_text = "".join(traceback.format_exception(type(e), e, e.__traceback__))
    # Some exception classes expose a ``message`` attribute; fall back to the
    # exception object itself otherwise.
    message = getattr(e, "message", e)
    logger.error(trace_text)
    logger.error(message)
def create_directory_safely(path, is_file_path=False):
    """Create a directory if it does not exist, handling potential errors safely.

    Failures are reported with ``print`` instead of being raised, so callers
    can treat directory creation as best-effort.

    Parameters
    ----------
    path : str
        Path to the directory or file.

    is_file_path : bool, optional
        If True, considers "path" as a file path and creates the directory
        containing the file.
    """
    try:
        if is_file_path:
            path = os.path.dirname(path)
        # A bare file name has an empty dirname: the file lives in the current
        # directory, so there is nothing to create (``os.makedirs("")`` would
        # raise and print a spurious error).
        if path == "":
            return
        if not os.path.exists(path):
            # exist_ok=True tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(path, exist_ok=True)
    except Exception as e:  # deliberate best-effort: report, never raise
        print(str(e))
def check_and_delete_corrupt_h5_file(file_path, logger):
    """Verify that an HDF5 file can be read and remove it when it cannot.

    A missing file is only reported. An empty (zero-byte) file is deleted
    straight away. Otherwise the file is opened read-only and its first
    group, if any, is accessed; any exception raised along the way is logged
    and the file is treated as corrupt and deleted.

    Parameters
    ----------
    file_path : str
        Path to the HDF5 file.

    logger : logging.Logger
        Logger instance to log actions.
    """
    basename = os.path.basename(file_path)

    if not os.path.exists(file_path):
        logger.info(f"File does not exist '{basename}'")
        return

    try:
        if os.path.getsize(file_path) == 0:
            logger.info(f"The file '{basename}' is empty.")
            os.remove(file_path)
            logger.info(f"The file '{basename}' has been deleted.")
            return

        with h5py.File(file_path, "r") as h5_file:
            names = list(h5_file.keys())
            if not names:
                logger.info(f"No groups found in the file '{basename}'.")
            else:
                first_name = names[0]
                # The lookup itself is the check: it raises if the entry
                # cannot be read.
                _ = h5_file[first_name]
                logger.info(
                    f"The first group '{first_name}' in the file '{basename}' has been "
                    f"accessed successfully."
                )
        logger.info(f"The file '{basename}' is not corrupt.")
    except Exception as error:
        # Any failure while inspecting the file is treated as corruption.
        log_exception_error(logger, error)
        logger.error(f"The file '{basename}' is corrupt.")
        os.remove(file_path)
        logger.error(f"The file '{basename}' has been deleted.")
def standardize_features(x_train, x_test, scaler=RobustScaler, scaler_params=None):
    """Standardize the training and test features using the specified scaler.

    The scaler is fitted on the training data only and then applied to both
    the training and the test data. The function offers flexibility to choose
    between `StandardScaler`, `RobustScaler`, and `MinMaxScaler`, and allows
    customization of the chosen scaler's parameters using a dictionary.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training set features.

    x_test : array-like of shape (n_samples, n_features)
        Test set features.

    scaler : {StandardScaler, RobustScaler, MinMaxScaler}, optional, default=RobustScaler
        The scaling class (not an instance) to be used for standardization.
        Choose from:

        - StandardScaler: Standardize features by removing the mean and
          scaling to unit variance.
        - RobustScaler: Scale features using statistics that are robust to
          outliers.
        - MinMaxScaler: Scale features to a given range (usually between 0
          and 1).

    scaler_params : dict, optional, default=None
        Keyword arguments passed to the selected scaler's constructor.
        ``None`` (or an empty dict) uses the scaler's defaults.
        Example: ``{'with_mean': False}`` for `StandardScaler`.

    Returns
    -------
    x_train : array-like of shape (n_samples, n_features)
        Standardized training set features.

    x_test : array-like of shape (n_samples, n_features)
        Standardized test set features.

    Raises
    ------
    ValueError
        If the specified scaler is not one of `StandardScaler`,
        `RobustScaler`, or `MinMaxScaler`.

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
    >>> import numpy as np
    >>> x_train = np.array([[1, 2], [2, 3], [3, 4]])
    >>> x_test = np.array([[4, 5], [5, 6]])

    StandardScaler with a custom parameter:

    >>> scaler_params = {'with_mean': False}
    >>> x_train_scaled, x_test_scaled = standardize_features(
    ...     x_train, x_test, scaler=StandardScaler, scaler_params=scaler_params
    ... )

    RobustScaler (the default):

    >>> x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler=RobustScaler)

    MinMaxScaler:

    >>> x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler=MinMaxScaler)

    An invalid scaler raises a ValueError:

    >>> try:
    ...     x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler="InvalidScaler")
    ... except ValueError as e:
    ...     print(e)
    Invalid scaler specified. Choose from StandardScaler, RobustScaler, or MinMaxScaler.
    """
    if scaler not in (StandardScaler, RobustScaler, MinMaxScaler):
        raise ValueError(
            "Invalid scaler specified. Choose from StandardScaler, RobustScaler, or MinMaxScaler."
        )

    # ``None`` replaces the former mutable ``{}`` default, which is a single
    # dict object shared by every call.
    params = {} if scaler_params is None else scaler_params
    scaler_instance = scaler(**params)

    # Fit on the training data only, then apply the same transformation to
    # the test data so no test-set statistics leak into the scaling.
    x_train = scaler_instance.fit_transform(x_train)
    x_test = scaler_instance.transform(x_test)

    return x_train, x_test