In [1]:
Copied!
%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2
Notebook: Scikit-learn Models
In [2]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sbn

# Apply seaborn's default theme to all matplotlib figures in this notebook.
# `set_theme()` is the preferred spelling; `set()` is only an alias for it.
sbn.set_theme()
Load the dataset
In [3]:
Copied!
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset
2024-02-24 22:44:16.622494: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-02-24 22:44:16.624330: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2024-02-24 22:44:16.651060: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2024-02-24 22:44:16.651085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2024-02-24 22:44:16.652011: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-02-24 22:44:16.656662: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2024-02-24 22:44:16.657264: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-02-24 22:44:17.314001: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT /home/luciano/venvs/ceruleo/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
In [4]:
Copied!
# Training split of the CMAPSS FD001 subset.
train_dataset = CMAPSSDataset(train=True, models='FD001')

# Only a 15-item slice (positions 15..29) of the FD001 test split is kept.
test_dataset = CMAPSSDataset(train=False, models='FD001')[15:30]
Create a transformer for a dataset
In [5]:
Copied!
from ceruleo.transformation.functional.transformers import Transformer
from ceruleo.transformation.features.selection import ByNameFeatureSelector
from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.dataset.catalog.CMAPSS import sensor_indices

# Names of the input columns: the entries of the first training item's
# columns selected by `sensor_indices`. The item is fetched once instead of
# once per index.
first_item_columns = train_dataset[0].columns
FEATURES = [first_item_columns[i] for i in sensor_indices]
In [6]:
Copied!
# Input pipeline: keep only the FEATURES columns and min-max scale them to
# the range (-1, 1). Target pipeline: keep only the 'RUL' column.
transformer = Transformer(
    pipelineX=make_pipeline(
        ByNameFeatureSelector(features=FEATURES),
        MinMaxScaler(range=(-1, 1)),
    ),
    pipelineY=make_pipeline(
        ByNameFeatureSelector(features=['RUL']),
    ),
)
Split train-test-validation
In [7]:
Copied!
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
In [8]:
Copied!
# Hold out 10% of the training items for validation. The split is seeded so
# that a Restart & Run All reproduces the same train/validation partition.
# NOTE: this rebinds `train_dataset`, so the cell must run exactly once after
# the dataset is loaded; re-running it would split the already reduced set.
train_dataset, val_dataset = train_test_split(
    train_dataset,
    train_size=0.9,
    random_state=42,
)
In [9]:
Copied!
# Number of items in the train / validation / test datasets.
len(train_dataset), len(val_dataset), len(test_dataset)
Out[9]:
(90, 10, 15)
Models
Scikit-learn
In [10]:
Copied!
import sklearn.pipeline as sk_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from ceruleo.models.sklearn import EstimatorWrapper, TimeSeriesWindowTransformer, CeruleoRegressor
from sklearn.linear_model import Ridge
import sklearn.pipeline as sk_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from ceruleo.models.sklearn import EstimatorWrapper, TimeSeriesWindowTransformer, CeruleoRegressor
from sklearn.linear_model import Ridge
In [11]:
Copied!
# Ridge regression fed by the window transformer (window_size=32, step=1,
# with padding). The model is built and fitted once.
regressor = CeruleoRegressor(
    ts_window_transformer=TimeSeriesWindowTransformer(
        transformer,
        window_size=32,
        padding=True,
        step=1,
    ),
    regressor=Ridge(alpha=15),
)
regressor.fit(train_dataset)
Out[11]:
CeruleoRegressor(regressor=Ridge(alpha=15),
ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>,
window_size=32))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CeruleoRegressor(regressor=Ridge(alpha=15),
ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>,
window_size=32))Ridge(alpha=15)
Ridge(alpha=15)
TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>,
window_size=32){'features': None, 'pipelineX': [('ByNameFeatureSelector', ['SensorMeasure2', 'SensorMeasure3', 'SensorMeasure4', 'SensorMeasure7', 'SensorMeasure8', 'SensorMeasure9', 'SensorMeasure11', 'SensorMeasure12', 'SensorMeasure13', 'SensorMeasure14', 'SensorMeasure15', 'SensorMeasure17', 'SensorMeasure20', 'SensorMeasure21']), ('MinMaxScaler', {'Min': SensorMeasure2 641.2100
SensorMeasure3 1571.0400
SensorMeasure4 1382.2500
SensorMeasure7 549.8500
SensorMeasure8 2387.9000
SensorMeasure9 9021.7300
SensorMeasure11 46.8500
SensorMeasure12 518.8300
SensorMeasure13 2387.8800
SensorMeasure14 8099.9400
SensorMeasure15 8.3249
SensorMeasure17 388.0000
SensorMeasure20 38.1400
SensorMeasure21 22.8942
dtype: float64, 'Max': SensorMeasure2 644.5300
SensorMeasure3 1616.9100
SensorMeasure4 1441.4900
SensorMeasure7 556.0600
SensorMeasure8 2388.5600
SensorMeasure9 9244.5900
SensorMeasure11 48.5300
SensorMeasure12 523.3800
SensorMeasure13 2388.5600
SensorMeasure14 8293.7200
SensorMeasure15 8.5678
SensorMeasure17 400.0000
SensorMeasure20 39.4300
SensorMeasure21 23.6184
dtype: float64})], 'pipelineY': [('ByNameFeatureSelector', ['RUL'])]}Pipeline(final_step=MinMaxScaler(name='MinMaxScaler', range=(-1, 1)))
MinMaxScaler
MinMaxScaler
Pipeline(final_step=ByNameFeatureSelector(features=['RUL'],
name='ByNameFeatureSelector'))ByNameFeatureSelector : [RUL]
ByNameFeatureSelector : [RUL]
Validation dataset results
In [12]:
Copied!
# Predicted vs. true target on the validation dataset.
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(val_dataset), label='Predicted')
ax.plot(regressor.ts_window_transformer.true_values(val_dataset), label='True values')
ax.set(title='Validation dataset', xlabel='Sample', ylabel='RUL')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();
Test dataset results
In [13]:
Copied!
# Predicted vs. true target on the test dataset.
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(test_dataset), label='Predicted')
ax.plot(regressor.ts_window_transformer.true_values(test_dataset), label='True values')
ax.set(title='Test dataset', xlabel='Sample', ylabel='RUL')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();
Parameters grid search
In [14]:
Copied!
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from ceruleo.models.sklearn import CeruleoMetricWrapper

# Build a new Transformer instance for the search rather than reusing the
# one already held by `regressor`.
transformer = Transformer(
    pipelineX=make_pipeline(
        ByNameFeatureSelector(features=FEATURES),
        MinMaxScaler(range=(-1, 1)),
    ),
    pipelineY=make_pipeline(
        ByNameFeatureSelector(features=['RUL']),
    ),
)

# Base estimator for the search. Its `regressor` and `window_size` are only
# starting values: both are overridden by `param_grid` for every candidate.
regressor_gs = CeruleoRegressor(
    ts_window_transformer=TimeSeriesWindowTransformer(
        transformer,
        window_size=32,
        padding=True,
        step=1,
    ),
    regressor=Ridge(alpha=15),
)

# 2 window sizes x 2 regressors, evaluated with 2-fold cross-validation and
# scored by negative mean absolute error.
grid_search = GridSearchCV(
    estimator=regressor_gs,
    param_grid={
        'ts_window_transformer__window_size': [5, 10],
        'regressor': [
            Ridge(alpha=15),
            # Seeded so repeated runs fit the same forest and therefore
            # select the same best estimator.
            RandomForestRegressor(max_depth=5, random_state=42),
        ],
    },
    scoring=CeruleoMetricWrapper('neg_mean_absolute_error'),
    cv=2,
)
grid_search.fit(train_dataset)
Out[14]:
GridSearchCV(cv=2,
estimator=CeruleoRegressor(regressor=Ridge(alpha=15),
ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>,
window_size=32)),
param_grid={'regressor': [Ridge(alpha=15),
RandomForestRegressor(max_depth=5)],
'ts_window_transformer__window_size': [5, 10]},
scoring=<ceruleo.models.sklearn.CeruleoMetricWrapper object at 0x7d9eea3868d0>)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=2,
estimator=CeruleoRegressor(regressor=Ridge(alpha=15),
ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>,
window_size=32)),
param_grid={'regressor': [Ridge(alpha=15),
RandomForestRegressor(max_depth=5)],
'ts_window_transformer__window_size': [5, 10]},
scoring=<ceruleo.models.sklearn.CeruleoMetricWrapper object at 0x7d9eea3868d0>)CeruleoRegressor(regressor=Ridge(alpha=15),
ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>,
window_size=32))Ridge(alpha=15)
Ridge(alpha=15)
TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>,
window_size=32){'features': None, 'pipelineX': [('ByNameFeatureSelector', []), ('MinMaxScaler', {'Min': None, 'Max': None})], 'pipelineY': [('ByNameFeatureSelector', [])]}Pipeline(final_step=MinMaxScaler(name='MinMaxScaler', range=(-1, 1)))
MinMaxScaler
MinMaxScaler
Pipeline(final_step=ByNameFeatureSelector(features=['RUL'],
name='ByNameFeatureSelector'))ByNameFeatureSelector : []
ByNameFeatureSelector : []
In [15]:
Copied!
# Compare the first Ridge model and the grid-search winner against the true
# target on the test dataset.
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.ts_window_transformer.true_values(test_dataset), label='True values')
ax.plot(regressor.predict(test_dataset), label='Previous estimator')
ax.plot(grid_search.best_estimator_.predict(test_dataset), label='Best estimator')
ax.set(title='Test dataset: previous vs. best estimator', xlabel='Sample', ylabel='RUL')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();