Skip to content

Feature distribution

features_divergeces(ds, number_of_bins=15, columns=None, show_progress=False)

Compute the divergence between features

Parameters:

Name Type Description Default
ds AbstractPDMDataset

The dataset

required
number_of_bins int

Number of bins

15
columns Optional[List[str]]

Which columns to use

None

Returns:

Type Description
DataFrame

A DataFrame in which each row contains the distances between the distributions of one feature in two run-to-failure cycles, with the following columns:

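  • Cycle 1: Index of the first run-to-failure cycle
  • Cycle 2: Index of the second run-to-failure cycle
  • Cycle 1 length: Number of samples in cycle 1
  • Cycle 2 length: Number of samples in cycle 2
  • Abs Length difference: Absolute difference between the two cycle lengths
  • Wasserstein: Wasserstein distance
  • KL: Symmetrised KL divergence
  • feature: The feature name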
Source code in ceruleo/dataset/analysis/distribution.py
def features_divergeces(
    ds: AbstractPDMDataset,
    number_of_bins: int = 15,
    columns: Optional[List[str]] = None,
    show_progress: bool = False,
) -> pd.DataFrame:
    """
    Compute the pairwise divergence between cycles for each feature

    For every feature a common set of bins is computed over the dataset,
    one histogram per run-to-failure cycle is built on those bins, and
    every unordered pair of cycles is then compared.

    Parameters:
        ds: The dataset
        number_of_bins: Number of bins
        columns: Which columns to use. When None, ds.numeric_features() is used
        show_progress: Whether to show a progress bar while the bins of each
            feature are being computed

    Returns:
        A DataFrame in which each row contains the distances between a feature of two run-to-failure cycles with the following columns:

            - Cycle 1: Index of the first run-to-failure cycle
            - Cycle 2: Index of the second run-to-failure cycle
            - Cycle 1 length: Number of rows of cycle 1
            - Cycle 2 length: Number of rows of cycle 2
            - Abs Length difference: Absolute difference of the two lengths
            - Wasserstein: Wasserstein distance between the two histograms
            - KL: Symmetrised KL divergence between the two histograms
            - feature: The feature name
    """
    if columns is None:
        columns = ds.numeric_features()

    features_bins = {}
    iterator = tqdm(columns) if show_progress else columns

    for feature in iterator:
        features_bins[feature] = compute_bins(ds, feature, number_of_bins)

    # Seed every requested feature so that a dataset without cycles yields an
    # empty result instead of a KeyError further down.
    histograms = {feature: [] for feature in columns}
    for life in iterate_over_features(ds):
        for feature in columns:
            histograms[feature].append(
                histogram_per_cycle(life, feature, features_bins[feature])
            )

    # The length of a cycle only depends on its index, so it is fetched at
    # most once per cycle rather than several times per pair and per feature.
    cycle_lengths = {}

    def cycle_length(index: int) -> int:
        if index not in cycle_lengths:
            cycle_lengths[index] = ds.get_features_of_life(index).shape[0]
        return cycle_lengths[index]

    df_data = []
    for feature in columns:
        for (i, h1), (j, h2) in itertools.combinations(
            enumerate(histograms[feature]), 2
        ):
            # Average of both directions to make the divergence symmetric
            kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2
            # NOTE(review): the histogram heights themselves are passed as the
            # values of the two distributions; if this is
            # scipy.stats.wasserstein_distance, bin positions are not taken
            # into account - confirm this is the intended distance.
            wd = wasserstein_distance(h1, h2)
            length_i = cycle_length(i)
            length_j = cycle_length(j)
            df_data.append(
                (
                    i,
                    j,
                    length_i,
                    length_j,
                    abs(length_i - length_j),
                    wd,
                    kl,
                    feature,
                )
            )

    return pd.DataFrame(
        df_data,
        columns=[
            "Cycle 1",
            "Cycle 2",
            "Cycle 1 length",
            "Cycle 2 length",
            "Abs Length difference",
            "Wasserstein",
            "KL",
            "feature",
        ],
    )

histogram_per_cycle(cycle, feature, bins_to_use, normalize=True)

Compute the histogram of a feature in a run-to-failure cycle

Parameters:

Name Type Description Default
cycle DataFrame

The run-to-failure cycle

required
feature str

The feature to compute the histogram

required
bins_to_use ndarray

Bin edges to use when computing the histogram

required
normalize bool

Whether to normalize the histogram. Defaults to True.

True

Returns:

Type Description
List[ndarray]

List[np.ndarray]: The histogram of the feature

Source code in ceruleo/dataset/analysis/distribution.py
def histogram_per_cycle(
    cycle: pd.DataFrame,
    feature: str,
    bins_to_use: np.ndarray,
    normalize: bool = True,
) -> Optional[np.ndarray]:
    """Compute the histogram of a feature in a run-to-failure cycle

    Args:
        cycle (pd.DataFrame): The run-to-failure cycle
        feature (str): The feature to compute the histogram
        bins_to_use (np.ndarray): Bins passed to np.histogram, i.e. the bin
            edges shared by all the cycles
        normalize (bool, optional): Whether to normalize the histogram so that
            it sums to one. A tiny constant is added afterwards so that no bin
            is exactly zero. Defaults to True.

    Returns:
        Optional[np.ndarray]: The histogram of the feature, or None if it
            could not be computed (the error is logged)
    """
    try:
        values = cycle[feature]
        h, _ = np.histogram(values, bins=bins_to_use)
    except Exception as e:
        # Best effort: report the problem and let the caller decide
        logger.info(f"Error {e} when computing the distribution for feature {feature}")
        return None

    if normalize:
        total = np.sum(h)
        # An empty cycle (or one with every sample outside the bins) has a
        # zero total; dividing would produce an all-NaN histogram.
        h = h / total if total > 0 else h.astype(float)
        # Keep every bin strictly positive so that divergences stay finite
        h += 1e-50
    return h