Skip to content

Builder

When building a PdM dataset you need the time series of the machine's sensors and some indication of when the piece of equipment reached the end of its life.

The DatasetBuilder class helps with this. It allows you to specify how your dataset is structured and to split it into run-to-failure cycles for later analysis.

Failure modes

Increasing feature

In scenarios where the dataset includes an increasing feature denoting the usage time of the item in question, it is possible to detect instances where the value at position 'i+1' is lower than the value at position 'i'. In such instances, we can establish that the item has been replaced at position 'i+1'. Consequently, we can determine the end of the previous item's lifespan as the last point within the increasing sequence, i.e. position 'i'.

Life identifier feature

In scenarios where the dataset includes a feature denoting each cycle ID, all the samples that share the same ID are grouped into one run-to-failure cycle.

Cycle end identifier

In situations where a dataset contains a feature that indicates the end of a cycle, it is possible to segment data points based on this feature. Similar to detecting changes in ascending sequences, this process identifies transitions in the 'life end indicator' feature.

Data and fault modes

In scenarios where the dataset is composed of two separate files, one with the sensor data and another with the fault data, it is possible to use the data and fault modes to split the run-to-failure cycles by combining both sources. In that case a datetime feature is required to align the sensor data with the faults.

Examples of usage

Increasing feature

# Two back-to-back run-to-failure cycles of 12 samples each: "Cycle" counts
# from 0 to 11 and then restarts, which marks the start of the second cycle.
samples = {
    "Cycle": list(range(12)) * 2,
    "feature_a": list(range(12)) * 2,
    "feature_b": list(range(12, 24)) * 2,
}
df = pd.DataFrame(samples)

# Configure the builder first, then build the dataset from the DataFrame.
builder = (
    DatasetBuilder()
    .set_splitting_method(IncreasingFeatureCycleSplitter("Cycle"))
    .set_rul_column_method(CycleRULColumn("Cycle"))
    .set_output_mode(InMemoryOutputMode())
)
dataset = builder.build_from_df(df)

Increasing with datetime based RUL feature

# Same two 12-sample cycles as before, plus one timestamp per minute that is
# handed to the RUL column method.
timestamps = pd.date_range("2021-01-01", periods=24, freq="min").tolist()
samples = {
    "Cycle": list(range(12)) * 2,
    "datetime": timestamps,
    "feature_a": list(range(12)) * 2,
    "feature_b": list(range(12, 24)) * 2,
}
df = pd.DataFrame(samples)

# Cycles are still split on the "Cycle" restart; only the RUL method changes.
builder = (
    DatasetBuilder()
    .set_splitting_method(IncreasingFeatureCycleSplitter("Cycle"))
    .set_rul_column_method(DatetimeRULColumn("datetime", "s"))
    .set_output_mode(InMemoryOutputMode())
)
dataset = builder.build_from_df(df)

Life identifier feature

# Each sample carries the identifier of the life it belongs to: the first 12
# rows belong to life 1 and the last 12 rows to life 2.
timestamps = pd.date_range("2021-01-01", periods=24, freq="min").tolist()
samples = {
    "life_id": [1] * 12 + [2] * 12,
    "datetime": timestamps,
    "feature_a": list(range(12)) * 2,
    "feature_b": list(range(12, 24)) * 2,
}
df = pd.DataFrame(samples)

# Configure the builder first, then build the dataset from the DataFrame.
builder = (
    DatasetBuilder()
    .set_splitting_method(LifeIdCycleSplitter("life_id"))
    .set_rul_column_method(DatetimeRULColumn("datetime", "s"))
    .set_output_mode(InMemoryOutputMode())
)
dataset = builder.build_from_df(df)

Life end indicator feature

# "life_end" is 0 everywhere except on the last sample of each 12-sample
# cycle, where it is 1.
timestamps = pd.date_range("2021-01-01", periods=24, freq="min").tolist()
one_cycle_flags = [0] * 11 + [1]
samples = {
    "life_end": one_cycle_flags + one_cycle_flags,
    "datetime": timestamps,
    "feature_a": list(range(12)) * 2,
    "feature_b": list(range(12, 24)) * 2,
}
df = pd.DataFrame(samples)

# Configure the builder first, then build the dataset from the DataFrame.
builder = (
    DatasetBuilder()
    .set_splitting_method(LifeEndIndicatorCycleSplitter("life_end"))
    .set_rul_column_method(DatetimeRULColumn("datetime", "s"))
    .set_output_mode(InMemoryOutputMode())
)
dataset = builder.build_from_df(df)

Data and fault modes

# Sensor data: one sample per minute, with no cycle information of its own.
timestamps = pd.date_range("2021-01-01", periods=24, freq="min").tolist()
samples = {
    "datetime": timestamps,
    "feature_a": list(range(12)) * 2,
    "feature_b": list(range(12, 24)) * 2,
}
df = pd.DataFrame(samples)

# Failure data: one row per failure, timestamped on the 12th and 24th samples.
failure_times = [
    pd.Timestamp("2021-01-01 00:11:00"),
    pd.Timestamp("2021-01-01 00:23:00"),
]
failures = pd.DataFrame(
    {
        "datetime_failure": failure_times,
        "failure_type": ["A", "B"],
    }
)

# The splitter aligns both tables through their respective time columns.
builder = (
    DatasetBuilder()
    .set_splitting_method(FailureDataCycleSplitter("datetime", "datetime_failure"))
    .set_rul_column_method(DatetimeRULColumn("datetime", "s"))
    .set_output_mode(InMemoryOutputMode())
)
dataset = builder.build_from_df(df, failures)
"""

Reference

Dataset Builder

Cycles Splitter

FailureDataCycleSplitter

Bases: CyclesSplitter

A splitter that divides a DataFrame into cycles based on a separate DataFrame containing failure data.

Source code in ceruleo/dataset/builder/cycles_splitter.py
class FailureDataCycleSplitter(CyclesSplitter):
    """A splitter that divides a DataFrame into cycles based on a separate DataFrame containing failure data.

    Every sensor sample is matched with the first failure that occurs at or
    after the sample timestamp. Samples matched with the same failure form
    one cycle.
    """

    data_time_column: str
    fault_time_column: str

    def __init__(self, data_time_column: str, fault_time_column: str):
        """Initializes the splitter with the time columns used for alignment.

        Parameters
        ----------
        data_time_column : str
            The name of the timestamp column in the sensor data.
        fault_time_column : str
            The name of the timestamp column in the failure data.
        """
        self.data_time_column = data_time_column
        self.fault_time_column = fault_time_column

    def split(self, data: pd.DataFrame, fault: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Splits the sensor data into one DataFrame per failure.

        Parameters
        ----------
        data : pd.DataFrame
            The sensor data. It must be sorted by the data time column.
        fault : pd.DataFrame
            The failure data, one row per failure.

        Yields
        ------
        Iterator[pd.DataFrame]
            An iterator of DataFrames, each containing the samples that
            precede one failure. Samples recorded after the last failure have
            no fault number and are therefore not part of any group.
        """
        merged = self.merge_data_with_faults(data, fault)
        # groupby only produces groups for fault numbers that are present, so
        # no empty-group check is needed.
        for _, life_data in merged.groupby("fault_number"):
            yield life_data.copy()

    def merge_data_with_faults(self, data: pd.DataFrame, fault: pd.DataFrame) -> pd.DataFrame:
        """Merge the raw sensor data with the fault information

        Parameters
        ----------
        data : pd.DataFrame
            The raw sensor data. It must be sorted by the data time column.
        fault : pd.DataFrame
            The fault information. Rows with a repeated fault timestamp are
            dropped, and the remaining rows are sorted by fault time.

        Returns
        -------
        pd.DataFrame
            The sensor data with the columns of the next fault attached,
            including a fault_number column that numbers the faults in
            chronological order. Rows after the last fault have missing
            values in the fault columns.
        """
        # merge_asof requires the right keys to be sorted; sorting before
        # numbering also makes fault_number follow chronological order.
        faults = fault.drop_duplicates(subset=[self.fault_time_column]).sort_values(
            self.fault_time_column
        )
        faults = faults.assign(fault_number=range(faults.shape[0]))
        return pd.merge_asof(
            data,
            faults,
            left_on=self.data_time_column,
            right_on=self.fault_time_column,
            suffixes=["_data", "_fault"],
            direction="forward",
        )

merge_data_with_faults(data, fault)

Merge the raw sensor data with the fault information

Parameters:

data: DataFrame with the raw sensor data
fault: DataFrame with the fault information

Returns:

df: Dataframe indexed by time with the raw sensors and faults
    The dataframe contains also a fault_number column
Source code in ceruleo/dataset/builder/cycles_splitter.py
def merge_data_with_faults(self, data: pd.DataFrame, fault: pd.DataFrame) -> pd.DataFrame:
    """Merge the raw sensor data with the fault information

    Parameters
    ----------
    data : pd.DataFrame
        The raw sensor data. It must be sorted by the data time column.
    fault : pd.DataFrame
        The fault information. Rows with a repeated fault timestamp are
        dropped, and the remaining rows are sorted by fault time.

    Returns
    -------
    pd.DataFrame
        The sensor data with the columns of the next fault attached,
        including a fault_number column that numbers the faults in
        chronological order. Rows after the last fault have missing
        values in the fault columns.
    """
    # merge_asof requires the right keys to be sorted; sorting before
    # numbering also makes fault_number follow chronological order.
    faults = fault.drop_duplicates(subset=[self.fault_time_column]).sort_values(
        self.fault_time_column
    )
    faults = faults.assign(fault_number=range(faults.shape[0]))
    return pd.merge_asof(
        data,
        faults,
        left_on=self.data_time_column,
        right_on=self.fault_time_column,
        suffixes=["_data", "_fault"],
        direction="forward",
    )

IncreasingFeatureCycleSplitter

Bases: CyclesSplitter

A splitter that divides a DataFrame into cycles based on changes in the value of an increasing feature.

When the value of the increasing feature decreases, a new cycle is considered to start.

Source code in ceruleo/dataset/builder/cycles_splitter.py
class IncreasingFeatureCycleSplitter(CyclesSplitter):
    """
    A splitter that divides a DataFrame into cycles based on changes in the value of an increasing feature.

    When the value of the increasing feature decreases, a new cycle is considered to start.

    """

    def __init__(self, increasing_feature: str):
        """Initializes the splitter with the name of the increasing feature.

        Parameters
        ----------
        increasing_feature : str
            The name of the increasing feature used for splitting.
        """
        self.increasing_feature = increasing_feature

    def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Splits the input DataFrame into cycles based on changes in the increasing feature.

        Parameters
        ----------
        data : pd.DataFrame
            The input DataFrame to be split.

        Yields
        ------
        Iterator[pd.DataFrame]
            An iterator of DataFrames, each containing a cycle of the input data.
            The rows after the last restart form the final cycle. An empty
            input yields nothing.
        """
        # Work with row positions instead of index labels, so the slicing
        # below is correct whatever index the DataFrame has.
        is_restart = (data[self.increasing_feature].diff() < 0).to_numpy()
        start_pos = 0
        for restart_pos in is_restart.nonzero()[0]:
            restart_pos = int(restart_pos)
            yield data.iloc[start_pos:restart_pos].copy()
            start_pos = restart_pos
        # The last cycle has no restart after it, so it is emitted here.
        if start_pos < data.shape[0]:
            yield data.iloc[start_pos:].copy()

__init__(increasing_feature)

Initializes the splitter with the name of the increasing feature.

Parameters

increasing_feature : str The name of the increasing feature used for splitting.

Source code in ceruleo/dataset/builder/cycles_splitter.py
def __init__(self, increasing_feature: str):
    """Store the column name that drives the cycle splitting.

    Parameters
    ----------
    increasing_feature : str
        Name of the column whose value grows within a cycle; it is kept
        on the instance as ``increasing_feature``.
    """
    self.increasing_feature = increasing_feature

split(data)

Splits the input DataFrame into cycles based on changes in the increasing feature.

Parameters

data : pd.DataFrame The input DataFrame to be split.

Yields

Iterator[pd.DataFrame] An iterator of DataFrames, each containing a cycle of the input data.

Source code in ceruleo/dataset/builder/cycles_splitter.py
def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Splits the input DataFrame into cycles based on changes in the increasing feature.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame to be split.

    Yields
    ------
    Iterator[pd.DataFrame]
        An iterator of DataFrames, each containing a cycle of the input data.
        The rows after the last restart form the final cycle. An empty
        input yields nothing.
    """
    # Work with row positions instead of index labels, so the slicing
    # below is correct whatever index the DataFrame has.
    is_restart = (data[self.increasing_feature].diff() < 0).to_numpy()
    start_pos = 0
    for restart_pos in is_restart.nonzero()[0]:
        restart_pos = int(restart_pos)
        yield data.iloc[start_pos:restart_pos].copy()
        start_pos = restart_pos
    # The last cycle has no restart after it, so it is emitted here.
    if start_pos < data.shape[0]:
        yield data.iloc[start_pos:].copy()

LifeEndIndicatorCycleSplitter

Bases: CyclesSplitter

A splitter that divides a DataFrame into cycles based on a life end indicator feature.

Source code in ceruleo/dataset/builder/cycles_splitter.py
class LifeEndIndicatorCycleSplitter(CyclesSplitter):
    """A splitter that divides a DataFrame into cycles based on a life end indicator feature."""

    def __init__(self, life_end_indicator_feature: str, end_value=1):
        """Initializes the splitter with the indicator column and its end value.

        Parameters
        ----------
        life_end_indicator_feature : str
            The name of the column representing the life end indicator.
        end_value : int, optional
            The value indicating the end of a life cycle. by default 1

        """
        self.life_end_indicator_feature = life_end_indicator_feature
        self.end_value = end_value

    def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Splits the input DataFrame into cycles based on a life end indicator feature.

        Parameters
        ----------
        data : pd.DataFrame
            The input DataFrame to be split.

        Yields
        ------
        Iterator[pd.DataFrame]
            An iterator of DataFrames, each containing a cycle of the input data.
            Each cycle ends on a row whose indicator equals the end value.
            Rows after the last such row are yielded as a final cycle.
        """
        # Work with row positions instead of index labels, so the slicing
        # below is correct whatever index the DataFrame has.
        is_end = (data[self.life_end_indicator_feature] == self.end_value).to_numpy()
        start_pos = 0
        for end_pos in is_end.nonzero()[0]:
            # The row carrying the end marker belongs to the cycle it closes.
            stop_pos = int(end_pos) + 1
            yield data.iloc[start_pos:stop_pos].copy()
            start_pos = stop_pos
        if start_pos < data.shape[0]:
            yield data.iloc[start_pos:].copy()

__init__(life_end_indicator_feature, end_value=1)

Parameters

life_end_indicator_feature : str The name of the column representing the life end indicator. end_value : int, optional The value indicating the end of a life cycle. by default 1

Source code in ceruleo/dataset/builder/cycles_splitter.py
def __init__(self, life_end_indicator_feature: str, end_value=1):
    """Store the indicator column name and the value that marks a life end.

    Parameters
    ----------
    life_end_indicator_feature : str
        Name of the column that holds the life end indicator.
    end_value : int, optional
        Indicator value that marks the end of a life cycle. by default 1

    """
    self.end_value = end_value
    self.life_end_indicator_feature = life_end_indicator_feature

split(data)

Splits the input DataFrame into cycles based on a life end indicator feature.

Parameters

data : pd.DataFrame The input DataFrame to be split.

Yields

Iterator[pd.DataFrame] An iterator of DataFrames, each containing a cycle of the input data.

Source code in ceruleo/dataset/builder/cycles_splitter.py
def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Splits the input DataFrame into cycles based on a life end indicator feature.

    Parameters
    ----------
    data : pd.DataFrame
        The input DataFrame to be split.

    Yields
    ------
    Iterator[pd.DataFrame]
        An iterator of DataFrames, each containing a cycle of the input data.
        Each cycle ends on a row whose indicator equals the end value.
        Rows after the last such row are yielded as a final cycle.
    """
    # Work with row positions instead of index labels, so the slicing
    # below is correct whatever index the DataFrame has.
    is_end = (data[self.life_end_indicator_feature] == self.end_value).to_numpy()
    start_pos = 0
    for end_pos in is_end.nonzero()[0]:
        # The row carrying the end marker belongs to the cycle it closes.
        stop_pos = int(end_pos) + 1
        yield data.iloc[start_pos:stop_pos].copy()
        start_pos = stop_pos
    if start_pos < data.shape[0]:
        yield data.iloc[start_pos:].copy()

LifeIdCycleSplitter

Bases: CyclesSplitter

A splitter that divides a DataFrame into cycles based on unique life identifiers.

Source code in ceruleo/dataset/builder/cycles_splitter.py
class LifeIdCycleSplitter(CyclesSplitter):
    """Cycle splitter that groups the rows of a DataFrame by a life identifier column."""

    def __init__(self, life_id_feature: str):
        """Store the name of the life identifier column.

        Parameters
        ----------
        life_id_feature : str
            Name of the column holding the identifier of each life.
        """
        self.life_id_feature = life_id_feature

    def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Yield one DataFrame per distinct life identifier.

        Parameters
        ----------
        data : pd.DataFrame
            The DataFrame to divide into cycles.

        Yields
        ------
        Iterator[pd.DataFrame]
            A copy of the rows belonging to each identifier, visiting the
            identifiers in the order returned by ``Series.unique``.
        """
        id_column = self.life_id_feature
        for current_id in data[id_column].unique():
            # Boolean row mask selecting the samples of the current life.
            belongs_to_life = data[id_column] == current_id
            yield data[belongs_to_life].copy()

__init__(life_id_feature)

Initializes the splitter with the name of the life id feature.

Parameters

life_id_feature : str The name of the column representing the life identifier.

Source code in ceruleo/dataset/builder/cycles_splitter.py
def __init__(self, life_id_feature: str):
    """Store the name of the life identifier column.

    Parameters
    ----------
    life_id_feature : str
        Name of the column holding the identifier of each life; it is
        kept on the instance as ``life_id_feature``.
    """
    self.life_id_feature = life_id_feature

split(data)

Splits the input DataFrame into cycles based on unique life identifiers.

Parameters

data : pd.DataFrame The input DataFrame to be split.

Yields

Iterator[pd.DataFrame] An iterator of DataFrames, each containing a cycle of the input data.

Source code in ceruleo/dataset/builder/cycles_splitter.py
def split(self, data: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Yield one DataFrame per distinct life identifier.

    Parameters
    ----------
    data : pd.DataFrame
        The DataFrame to divide into cycles.

    Yields
    ------
    Iterator[pd.DataFrame]
        A copy of the rows belonging to each identifier, visiting the
        identifiers in the order returned by ``Series.unique``.
    """
    id_column = self.life_id_feature
    for current_id in data[id_column].unique():
        # Boolean row mask selecting the samples of the current life.
        belongs_to_life = data[id_column] == current_id
        yield data[belongs_to_life].copy()