Skip to content

Pairwise correlations

correlation_analysis(dataset, features=None)

Correlation Analysis: compute the pairwise correlation between all the features across the executions of a dataset.

Parameters:

Name Type Description Default
dataset AbstractPDMDataset

Dataset of time series

required
features Optional[List[str]]

List of features to consider when computing the correlations

None

Returns:

Type Description
CorrelationAnalysis

A CorrelationAnalysis object with a map indexed by two column names and the following information:

  • Mean Correlation
  • Std Correlation
  • Percentage of lives with a high correlation
  • Abs mean correlation
  • Std abs mean correlation
  • Max correlation
  • Min correlation
Source code in ceruleo/dataset/analysis/correlation.py
def correlation_analysis(
    dataset: AbstractPDMDataset,
    features: Optional[List[str]] = None,
) -> CorrelationAnalysis:
    """
    Correlation Analysis

    Compute the pairwise correlation between features for every execution
    of the dataset, then summarise each feature pair across executions.

    Parameters:
        dataset: Dataset of time series
        features: List of features to consider when computing the
            correlations. Names that are not among the dataset's common
            features are ignored. When None, all common features are used.

    Returns:
        A CorrelationAnalysis object with a map indexed by two column names
        and the following information:

            - Mean correlation
            - Std correlation
            - Max correlation
            - Min correlation
            - Abs mean correlation (mean of the absolute correlations)
            - Std abs mean correlation (std of the absolute correlations)

        Statistics that come out as NaN are reported as 0.

    """
    common_features = dataset.common_features()
    if features is None:
        features = sorted(common_features)
    else:
        features = sorted(set(features).intersection(common_features))

    # Keep only the columns that appear in the correlation matrix of the
    # first life.
    # NOTE(review): on pandas >= 2.0 DataFrame.corr() defaults to
    # numeric_only=False, so non-numeric columns may raise here instead of
    # being dropped -- confirm against the supported pandas version.
    features = dataset.get_features_of_life(0)[features].corr().columns

    # The set of pairs is the same for every execution; build it once.
    feature_pairs = list(combinations(features, 2))

    correlated_features = []
    for execution in iterate_over_features(dataset):
        # Correlations that come out as NaN count as 0 for this execution.
        corr_m = execution[features].corr().fillna(0)
        correlated_features.extend(
            (f1, f2, corr_m.loc[f1, f2]) for f1, f2 in feature_pairs
        )

    # Without any (pair, execution) sample there is nothing to aggregate.
    if not correlated_features:
        return CorrelationAnalysis(data={})

    df = pd.DataFrame(correlated_features, columns=["Feature 1", "Feature 2", "Corr"])
    df["Abs corr"] = df["Corr"].abs()

    # A single named aggregation yields flat, explicitly named columns.
    output = (
        df.groupby(by=["Feature 1", "Feature 2"])
        .agg(
            mean_correlation=("Corr", "mean"),
            std_correlation=("Corr", "std"),
            max_correlation=("Corr", "max"),
            min_correlation=("Corr", "min"),
            abs_mean_correlation=("Abs corr", "mean"),
            std_abs_mean_correlation=("Abs corr", "std"),
        )
        # e.g. the std is NaN when a pair was observed in only one execution
        .fillna(0)
    )

    return CorrelationAnalysis(
        data={
            (pair[0], pair[1]): CorrelationAnalysisElement(
                mean_correlation=stats["mean_correlation"],
                std_correlation=stats["std_correlation"],
                max_correlation=stats["max_correlation"],
                min_correlation=stats["min_correlation"],
                abs_mean_correlation=stats["abs_mean_correlation"],
                std_abs_mean_correlation=stats["std_abs_mean_correlation"],
            )
            for pair, stats in output.iterrows()
        }
    )