
Catalog

The CERULEo dataset catalog is a collection of RUL estimation datasets ready to use. All datasets are exposed as `AbstractTimeSeriesDataset`, enabling easy-to-use input and transformation pipelines. To get started, see the guide and our list of datasets.
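
As a quick orientation, the sketch below shows the access pattern shared by the catalog classes on this page (`n_time_series`, `get_time_series`, and `rul_column` all appear in the source listings that follow); the import path mirrors the `ceruleo/dataset/catalog/CMAPSS.py` module shown below and may differ across versions.

```py
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset

# Every catalog dataset is a collection of run-to-failure time series.
dataset = CMAPSSDataset(train=True, models="FD001")

print(dataset.n_time_series)       # number of run-to-failure cycles
life = dataset.get_time_series(0)  # one cycle as a DataFrame
print(life[dataset.rul_column])    # the target column, "RUL"
```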

CMAPSS

CMAPSSDataset

Bases: AbstractPDMDataset

C-MAPSS Dataset

C-MAPSS stands for 'Commercial Modular Aero-Propulsion System Simulation' and it is a tool for the simulation of realistic large commercial turbofan engine data. Each flight is a combination of a series of flight conditions with a reasonable linear transition period to allow the engine to change from one flight condition to the next. The flight conditions are arranged to cover a typical ascent from sea level to 35K ft and descent back down to sea level.

The fault was injected at a given time in one of the flights and persists throughout the remaining flights, effectively increasing the age of the engine. The intent is to identify which flight and when in the flight the fault occurred.

[Dataset reference](https://data.nasa.gov/dataset/C-MAPSS-Aircraft-Engine-Simulator-Data/xaut-bemq)

Available models are:

- FD001
- FD002
- FD003
- FD004
Example:

```py
train_dataset = CMAPSSDataset(train=True, models='FD001')
validation_dataset = CMAPSSDataset(train=False, models='FD001')
```

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `train` | `bool` | Whether to obtain the train data provided, by default True | `True` |
| `models` | `Optional[Union[str, List[str]]]` | Names of the models, by default None (all models) | `None` |
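
Since `models` also accepts a list, the hedged sketch below combines two sub-datasets; the `Engine` column it reads is added by the constructor, as the source below shows.

```py
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset

# Combine two sub-datasets; each life keeps an "Engine" column
# identifying which model it came from.
train_dataset = CMAPSSDataset(train=True, models=["FD001", "FD003"])

first_life = train_dataset.get_time_series(0)
print(first_life["Engine"].iloc[0])  # e.g. "FD001"
```
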
Source code in ceruleo/dataset/catalog/CMAPSS.py
class CMAPSSDataset(AbstractPDMDataset):
    """C-MAPSS Dataset

    C-MAPSS stands for 'Commercial Modular Aero-Propulsion System Simulation' and it is a tool for the simulation 
    of realistic large commercial turbofan engine data. Each flight is a combination of a 
    series of flight conditions with a reasonable linear transition period to allow the 
    engine to change from one flight condition to the next. The flight conditions are arranged
    to cover a typical ascent from sea level to 35K ft and descent back down to sea level. 

    The fault was injected at a given time in one of the flights and persists throughout the 
    remaining flights, effectively increasing the age of the engine. The intent is to identify which 
    flight and when in the flight the fault occurred.

    [Dataset reference](https://data.nasa.gov/dataset/C-MAPSS-Aircraft-Engine-Simulator-Data/xaut-bemq)

    Available models are:

        - FD001
        - FD002
        - FD003
        - FD004


    Example:
        ```
        train_dataset = CMAPSSDataset(train=True, models='FD001')
        validation_dataset = CMAPSSDataset(train=False, models='FD001')
        ```

    Parameters:
        train: Whether to obtain the train data provided, by default True
        models: Names of the models, by default None (all models)
    """
    def __init__(
        self, train: bool = True, models: Optional[Union[str, List[str]]] = None
    ):
        super().__init__()
        obtain_raw_files(DATASET_PATH)
        if models is not None and isinstance(models, str):
            models = [models]
        self._validate_model_names(models)
        if train:
            processing_fun = process_file_train
        else:
            processing_fun = process_file_test
        self.lives = []

        for engine in engines:
            if models is not None and engine not in models:
                continue
            for _, g in processing_fun(engine).groupby("UnitNumber"):
                g.drop(columns=["UnitNumber"], inplace=True)
                g["Engine"] = engine
                self.lives.append(g)

    def _validate_model_names(self, models):
        if models is not None:
            for model in models:
                if model not in operation_mode:
                    raise ValueError(
                        f"Invalid model: valid model are {list(operation_mode.keys())}"
                    )

    def get_time_series(self, i):
        return self.lives[i]

    @property
    def n_time_series(self):
        return len(self.lives)

    @property
    def rul_column(self) -> str:
        return "RUL"

obtain_raw_files(raw_data_path=DATASET_PATH)

Download and unzip the raw files

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `raw_data_path` | `Path` | Path where to store the dataset | `DATASET_PATH` |
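
A minimal usage sketch, assuming the default directory layout (the function is also invoked automatically by `CMAPSSDataset.__init__`, as the class source above shows):

```py
from pathlib import Path

from ceruleo.dataset.catalog.CMAPSS import obtain_raw_files

# Download and unzip the files into a custom location instead of DATASET_PATH.
obtain_raw_files(raw_data_path=Path("/tmp/cmapss"))
```
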
Source code in ceruleo/dataset/catalog/CMAPSS.py
def obtain_raw_files(raw_data_path: Path = DATASET_PATH):
    """Download and unzip the raw files

    Parameters:
        raw_data_path: Path where to store the dataset
    """
    raw_data_path = raw_data_path / "files"
    if not raw_data_path.is_dir():
        logger.info("Dataset not processed.")
        raw_data_path.mkdir(exist_ok=True, parents=True)
        ZIP_FILE = raw_data_path / "CMAPSSData.zip"
        if not ZIP_FILE.is_file():
            logger.info('Downloading file')            
            download(URL, ZIP_FILE)
        logger.info("Unzipping")
        with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
            zip_ref.extractall(raw_data_path)
        logger.info("Removing zip file")
        ZIP_FILE.unlink()

CMAPSS-2

CMAPSS2Dataset

Bases: AbstractPDMDataset

C-MAPSS-2 Dataset

This dataset provides realistic run-to-failure trajectories for a small fleet of aircraft engines under realistic flight conditions.

The damage propagation modeling used to generate this synthetic dataset builds on the modeling strategy from previous work. The dataset was generated with the Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) dynamical model and is provided by the Prognostics CoE at NASA Ames in collaboration with ETH Zurich and PARC.

[Dataset reference](https://data.phmsociety.org/2021-phm-conference-data-challenge/)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `Path` | Path where the dataset is stored | `DATASET_PATH` |
| `train` | `Optional[bool]` | Whether to obtain the train data provided | `None` |
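
A usage sketch mirroring the C-MAPSS example above; the `train` flag filters the cycles table, as the source below shows, and the import path follows `ceruleo/dataset/catalog/CMAPSS2.py`.

```py
from ceruleo.dataset.catalog.CMAPSS2 import CMAPSS2Dataset

# train=True keeps only cycles marked as training data;
# train=None (the default) keeps every cycle.
train_dataset = CMAPSS2Dataset(train=True)
print(train_dataset.n_time_series)
```
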
Source code in ceruleo/dataset/catalog/CMAPSS2.py
class CMAPSS2Dataset(AbstractPDMDataset):
    """C-MAPSS-2 Dataset

    This dataset provides realistic run-to-failure trajectories for a small fleet of aircraft
    engines under realistic flight conditions.

    The damage propagation modeling used to generate this synthetic dataset builds on
    the modeling strategy from previous work.
    The dataset was generated with the Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) dynamical model
    and is provided by the Prognostics CoE at NASA Ames in collaboration with ETH Zurich and PARC.

    [Dataset reference](https://data.phmsociety.org/2021-phm-conference-data-challenge/)

    Parameters:
        path: Path where the dataset is stored
        train: Whether to obtain the train data provided
    """
    def __init__(
        self,
        path: Path = DATASET_PATH,
        train: Optional[bool] = None,
    ):
        super().__init__()
        self.path = path
        LIVES_TABLE_PATH = path / "lives_data.pkl"
        if not (LIVES_TABLE_PATH).is_file():
            pr = CMAPSS2PreProcessor()
            pr.run()
        with open(LIVES_TABLE_PATH, "rb") as file:
            self.lives = pickle.load(file)

        if train is not None:
            self.lives = self.lives[self.lives["Train"] == train]

    def get_time_series(self, i):
        df_path = self.lives.iloc[i]["Output Dir"]
        df = pd.read_parquet(self.path / df_path)
        return df

    @property
    def n_time_series(self):
        return len(self.lives)

    @property
    def rul_column(self) -> str:
        return "RUL"

PHMDataset2018

FailureType

Bases: Enum

Failure types available for the dataset.

Possible values are:

```py
FailureType.FlowCoolPressureDroppedBelowLimit
FailureType.FlowcoolPressureTooHighCheckFlowcoolPump
FailureType.FlowcoolLeak
FailureType.FlowcoolPressureTooHighCheckFlowcoolPumpNoWaferID
```

Source code in ceruleo/dataset/catalog/PHMDataset2018.py
class FailureType(Enum):
    """Failure types availables for the dataset.

    Possible values are:
    ```
    FailureType.FlowCoolPressureDroppedBelowLimit
    FailureType.FlowcoolPressureTooHighCheckFlowcoolPump
    FailureType.FlowcoolLeak
    ```
    """

    FlowCoolPressureDroppedBelowLimit = "FlowCool Pressure Dropped Below Limit"
    FlowcoolPressureTooHighCheckFlowcoolPump = (
        'Flowcool Pressure Too High Check Flowcool Pump'
    )
    FlowcoolLeak = "Flowcool leak"
    FlowcoolPressureTooHighCheckFlowcoolPumpNoWaferID = 'Flowcool Pressure Too High Check Flowcool Pump [NoWaferID]'


    @staticmethod
    def that_starth_with(s: str):
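        """Return the first FailureType whose value is a prefix of `s`, else None."""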
        for f in FailureType:
            if s.startswith(f.value):
                return f
        return None
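
A small sketch of using the enum, including the prefix-matching helper (note the helper really is named `that_starth_with` in the source above):

```py
from ceruleo.dataset.catalog.PHMDataset2018 import FailureType

# Map a raw fault string back to an enum member by prefix.
fault = FailureType.that_starth_with("FlowCool Pressure Dropped Below Limit")
assert fault is FailureType.FlowCoolPressureDroppedBelowLimit
```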

PHMDataset2018

Bases: PDMDataset

PHM 2018 Dataset

The 2018 PHM dataset is a public dataset released by Seagate which contains data from the execution of 20 different ion milling machines. It distinguishes three different failure causes and provides 22 features, including user-defined variables and sensors.

Three faults are present in the dataset:

- Fault mode 1 occurs when flow-cool pressure drops.
- Fault mode 2 occurs when flow-cool pressure becomes too high.
- Fault mode 3 represents flow-cool leakage.

[Dataset reference](https://phmsociety.org/conference/annual-conference-of-the-phm-society/annual-conference-of-the-prognostics-and-health-management-society-2018-b/phm-data-challenge-6/)

Example:

```py
dataset = PHMDataset2018(
    failure_types=FailureType.FlowCoolPressureDroppedBelowLimit,
    tools=['01_M02']
)
```

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `Path` | Path where the dataset is located | `DATA_PATH` |
| `url` | `str` | URL from which to download the raw data | `URL` |
| `failure_types` | `Optional[Union[FailureType, List[FailureType]]]` | Failure types used to filter the cycles, by default None (all) | `None` |
| `tools` | `Optional[Union[str, List[str]]]` | Tool identifiers used to filter the cycles, by default None (all) | `None` |
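
Both filters accept lists as well; a hedged sketch (the tool identifier `'01_M02'` comes from the example above; `cycles_metadata` appears in the constructor source below):

```py
from ceruleo.dataset.catalog.PHMDataset2018 import FailureType, PHMDataset2018

dataset = PHMDataset2018(
    failure_types=[
        FailureType.FlowCoolPressureDroppedBelowLimit,
        FailureType.FlowcoolLeak,
    ],
    tools=["01_M02"],  # keep only cycles recorded on this tool
)
print(len(dataset.cycles_metadata))  # cycles remaining after filtering
```
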
Source code in ceruleo/dataset/catalog/PHMDataset2018.py
class PHMDataset2018(PDMDataset):
    """PHM 2018 Dataset

    The 2018 PHM dataset is a public dataset released by Seagate which contains data from the execution
    of 20 different ion milling machines. It distinguishes three different failure causes and provides
    22 features, including user-defined variables and sensors.

    Three faults are present in the dataset:

    * Fault mode 1 occurs when flow-cool pressure drops.
    * Fault mode 2 occurs when flow-cool pressure becomes too high.
    * Fault mode 3 represents flow-cool leakage.

    [Dataset reference](https://phmsociety.org/conference/annual-conference-of-the-phm-society/annual-conference-of-the-prognostics-and-health-management-society-2018-b/phm-data-challenge-6/)

    Example:

    ```py
    dataset = PHMDataset2018(
        failure_types=FailureType.FlowCoolPressureDroppedBelowLimit,
        tools=['01_M02']
    )
    ```



    Parameters:
        path: Path where the dataset is located
        url: URL from which to download the raw data
        failure_types: Failure types used to filter the cycles, by default None (all)
        tools: Tool identifiers used to filter the cycles, by default None (all)
    """

    failure_types: Optional[List[FailureType]]
    tools: Optional[List[str]]

    def __init__(
        self,
        path: Path = DATA_PATH,
        url: str = URL,
        failure_types: Optional[Union[FailureType, List[FailureType]]] = None,
        tools: Optional[Union[str, List[str]]] = None,
    ):
        self.url = url
        super().__init__(path / "phm_data_challenge_2018", "RUL")
        self._prepare_dataset()
        self.failure_types = failure_types
        self.tools = tools

        if self.failure_types is not None:
            if not isinstance(self.failure_types, list):
                self.failure_types = [failure_types]
            self.cycles_metadata = self.cycles_metadata[
                self.cycles_metadata["Fault name"].isin(
                    [f.value for f in self.failure_types]
                )
            ]


        if self.tools is not None:
            if not isinstance(self.tools, list):
                self.tools = [tools]
            self.cycles_metadata = self.cycles_metadata[
                self.cycles_metadata["Tool"].isin(self.tools)
            ]

    def _prepare_dataset(self):
        if self.cycles_table_filename.is_file():
            return
        if not (self.dataset_path / "raw" / "train").is_dir():
            self.prepare_raw_dataset()
        files = list(Path(self.dataset_path / "raw" / "train").resolve().glob("*.csv"))
        faults_files = list(
            Path(self.dataset_path / "raw" / "train" / "train_faults")
            .resolve()
            .glob("*.csv")
        )

        def get_key_from_filename(filename: str) -> str:
            return "_".join(filename.split("_")[0:2])

        fault_files_map = {get_key_from_filename(f.name): f for f in faults_files}
        data_fault_pairs = [
            (file, fault_files_map[get_key_from_filename(file.name)]) for file in files
        ]

        (
            DatasetBuilder()
            .set_splitting_method(
                FailureDataCycleSplitter(
                    data_time_column="time", fault_time_column="time"
                )
            )
            .set_rul_column_method(NumberOfRowsRULColumn())
            .set_output_mode(
                LocalStorageOutputMode(
                    self.dataset_path, output_format=DatasetFormat.PARQUET
                ).set_metadata_columns(
                    {"Tool": "Tool_data", "Fault name": "fault_name"}
                )
            )
            .set_index_column("time")
            .prepare_from_data_fault_pairs_files(
                data_fault_pairs,
            )
        )

    def prepare_raw_dataset(self):
        """Download and unzip the raw files into `self.dataset_path / "raw"`"""

        def track_progress(members):
            for member in tqdm(members, total=70):
                yield member

        path = self.dataset_path / "raw"
        path.mkdir(parents=True, exist_ok=True)
        if not (path / OUTPUT).resolve().is_file():
            download(self.url, path)
        logger.info("Decompressing  dataset...")
        with tarfile.open(path / OUTPUT, "r") as tarball:

            def is_within_directory(directory, target):
                abs_directory = os.path.abspath(directory)
                abs_target = os.path.abspath(target)
                prefix = os.path.commonprefix([abs_directory, abs_target])
                return prefix == abs_directory

            def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
                for member in tar.getmembers():
                    member_path = os.path.join(path, member.name)
                    if not is_within_directory(path, member_path):
                        raise Exception("Attempted Path Traversal in Tar File")

                tar.extractall(path, members, numeric_owner=numeric_owner)

            safe_extract(tarball, path=path, members=track_progress(tarball))
        shutil.move(
            str(path / "phm_data_challenge_2018" / "train"), str(path / "train")
        )
        shutil.move(str(path / "phm_data_challenge_2018" / "test"), str(path / "test"))
        shutil.rmtree(str(path / "phm_data_challenge_2018"))
        (path / OUTPUT).unlink()

prepare_raw_dataset()

Download and unzip the raw files into `self.dataset_path / "raw"`. The method takes no parameters; the destination is derived from the dataset path given at construction.
Source code in ceruleo/dataset/catalog/PHMDataset2018.py
def prepare_raw_dataset(self):
    """Download and unzip the raw files into `self.dataset_path / "raw"`"""

    def track_progress(members):
        for member in tqdm(members, total=70):
            yield member

    path = self.dataset_path / "raw"
    path.mkdir(parents=True, exist_ok=True)
    if not (path / OUTPUT).resolve().is_file():
        download(self.url, path)
    logger.info("Decompressing  dataset...")
    with tarfile.open(path / OUTPUT, "r") as tarball:

        def is_within_directory(directory, target):
            abs_directory = os.path.abspath(directory)
            abs_target = os.path.abspath(target)
            prefix = os.path.commonprefix([abs_directory, abs_target])
            return prefix == abs_directory

        def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
            for member in tar.getmembers():
                member_path = os.path.join(path, member.name)
                if not is_within_directory(path, member_path):
                    raise Exception("Attempted Path Traversal in Tar File")

            tar.extractall(path, members, numeric_owner=numeric_owner)

        safe_extract(tarball, path=path, members=track_progress(tarball))
    shutil.move(
        str(path / "phm_data_challenge_2018" / "train"), str(path / "train")
    )
    shutil.move(str(path / "phm_data_challenge_2018" / "test"), str(path / "test"))
    shutil.rmtree(str(path / "phm_data_challenge_2018"))
    (path / OUTPUT).unlink()