# Source code for autoqild.dataset_readers.utils

"""Provides utility functions for dataset handling, operations, and
preprocessing."""

import logging

import numpy as np

__all__ = ["GEN_TYPES", "FACTOR", "LABEL_COL", "generate_samples_per_class", "clean_class_label", "pdf"]

# Accepted values for the ``gen_type`` argument of
# ``generate_samples_per_class``; that function raises ``ValueError`` for any
# value not listed here.
GEN_TYPES = ["single", "multiple"]
"""
    List of supported generation types for class imbalance:
    
    - `single`: Imbalance is applied to one class.
    - `multiple`: Imbalance is distributed across multiple classes.
"""

# Exported through ``__all__`` but not used by any function visible in this
# module. NOTE(review): what exactly is scaled by this factor is decided by the
# importing code -- confirm against the callers.
FACTOR = 1.5
"""A constant factor used for scaling or other operations."""

# Exported through ``__all__`` but not used by any function visible in this
# module. NOTE(review): presumably the name of the target column in the dataset
# frames built elsewhere in the package -- confirm against the callers.
LABEL_COL = "label"
"""Default label column name used in datasets."""



# [docs]
def generate_samples_per_class(n_classes, samples=1000, imbalance=0.05, gen_type="single", logger=None, verbose=1):
    """Generate the number of samples per class with a specified imbalance.

    The dataset holds ``samples * n_classes`` instances in total. The first
    ``n_classes - 1`` classes each receive the same (rounded-up) share and the
    last class receives whatever remains, so the counts always add up to the
    total:

    - ``gen_type="single"``: the first ``n_classes - 1`` classes share
      ``1 - imbalance`` of the total equally; the last class is the minority
      class and ends up with roughly ``imbalance`` of the total.
    - ``gen_type="multiple"``: each of the first ``n_classes - 1`` classes is a
      minority class holding ``imbalance`` of the total; the last class gets
      the rest.

    Parameters
    ----------
    n_classes : int
        The number of classes in the dataset. Must be at least 1.

    samples : int, default=1000
        The average number of samples per class; the total number of instances
        is ``samples * n_classes``.

    imbalance : float, default=0.05
        The proportion of the total assigned to the minority class (or to each
        minority class if `gen_type` is "multiple"). Must lie in
        ``[0, 1 / n_classes]``.

    gen_type : str, default="single"
        The type of imbalance generation (one of ``GEN_TYPES``):
        - "single": Imbalance is applied to one class.
        - "multiple": Imbalance is distributed across multiple classes.
        With exactly two classes only "single" is accepted.

    logger : logging.Logger, optional
        Logger object for logging output. If None, the logger named
        "Generate Samples" is used.

    verbose : int, default=1
        Verbosity level. Any truthy value enables the info-level log messages.

    Returns
    -------
    samples_per_class : dict
        A dictionary where the keys are class labels (``"0"`` ...
        ``str(n_classes - 1)``) and the values are the number of samples for
        each class.

    Raises
    ------
    ValueError
        If `n_classes` is smaller than 1, if `imbalance` is negative or greater
        than ``1 / n_classes``, if `gen_type` is not recognized, or if
        `gen_type` is not supported for the given number of classes.
    """
    if logger is None:
        logger = logging.getLogger("Generate Samples")
    if verbose:
        logger.info(
            "###############################################################################"
        )
    # Checked first so that ``1 / n_classes`` below cannot divide by zero.
    if n_classes < 1:
        raise ValueError(f"The number of classes must be at least 1, got {n_classes}")
    if imbalance < 0:
        raise ValueError(f"The imbalance {imbalance} cannot be negative")
    if imbalance > 1 / n_classes:
        raise ValueError(
            f"The imbalance {np.around(imbalance, 2)} for a class cannot be more than uniform {1 / n_classes}"
        )
    if gen_type not in GEN_TYPES:
        raise ValueError(f"Generation type {gen_type} not defined {GEN_TYPES}")
    # Same accept/reject set as the former ``assert``, but raised as a real
    # exception so the check is not stripped when Python runs with ``-O``.
    if n_classes <= 2 and (n_classes == 2) != (gen_type == "single"):
        raise ValueError(f"Generation type {gen_type} is not supported for {n_classes} classes")

    n_total_instances = samples * n_classes
    n_leading = n_classes - 1
    # The share of every leading class is identical, so compute it once.
    if gen_type == "single":
        n_samples = (1 - imbalance) / n_leading * n_total_instances
    else:
        n_samples = imbalance * n_total_instances
    leading_count = int(np.ceil(n_samples))

    samples_per_class = {}
    for n_c in range(n_leading):
        samples_per_class[str(n_c)] = leading_count
        if verbose:
            logger.info(f"Class {n_c + 1} calculated {n_samples / n_total_instances}")

    # The last class absorbs the rounding so that the counts sum to the total.
    last_label = str(n_classes - 1)
    samples_per_class[last_label] = n_total_instances - leading_count * n_leading
    if verbose:
        v = samples_per_class[last_label] / n_total_instances
        logger.info(f"Class {n_classes} calculated {np.around(v, 2)}")
        logger.info(f"Imbalanced {np.around(imbalance, 2)} samples_per_class {samples_per_class}")
    return samples_per_class




# [docs]
def clean_class_label(string):
    """Clean and format a class label string.

    Underscores are treated as word separators: they are replaced by spaces,
    every run of spaces is collapsed to a single space, leading and trailing
    spaces are dropped, and each word is capitalized with ``str.title``.

    Parameters
    ----------
    string : str
        The input class label string to be cleaned and formatted.

    Returns
    -------
    str
        The cleaned and formatted class label string.

    Example
    -------
    >>> clean_class_label("class_label_example")
    'Class Label Example'
    >>> clean_class_label("__noisy___label_")
    'Noisy Label'

    Notes
    -----
    This function is useful for formatting class labels in a readable way, especially when they are
    generated automatically or retrieved from a source where they are not human-readable.
    """
    # Split on the literal space (not ``str.split()``) so that only spaces and
    # underscores count as separators; dropping the empty pieces collapses
    # runs of any length, which a single ``replace("  ", " ")`` pass cannot do.
    words = [word for word in string.replace("_", " ").split(" ") if word]
    return " ".join(words).title()




# [docs]
def pdf(dist, x):
    """Evaluate the probability density of `dist` at the points `x`.

    The density is obtained by exponentiating the log-density reported by the
    distribution object.

    Parameters
    ----------
    dist : scipy.stats._multivariate.multivariate_normal_frozen
        The multivariate normal distribution object. Only its ``logpdf``
        method is called.
    x : array-like of shape (n_samples, n_features)
        Input data for which the PDF is computed.

    Returns
    -------
    density : array-like
        Probability density values for the input data.
    """
    log_density = dist.logpdf(x)
    return np.exp(log_density)