In [1]:
Copied!
%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2
Notebook: Scikit-learn Models¶
In [2]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set()
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set()
Load the dataset¶
In [3]:
Copied!
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset
from ceruleo.dataset.catalog.CMAPSS import CMAPSSDataset
2024-02-24 22:44:16.622494: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-02-24 22:44:16.624330: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2024-02-24 22:44:16.651060: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2024-02-24 22:44:16.651085: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2024-02-24 22:44:16.652011: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-02-24 22:44:16.656662: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2024-02-24 22:44:16.657264: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-02-24 22:44:17.314001: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT /home/luciano/venvs/ceruleo/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
In [4]:
Copied!
train_dataset = CMAPSSDataset(train=True, models='FD001')
test_dataset = CMAPSSDataset(train=False, models='FD001')[15:30]
train_dataset = CMAPSSDataset(train=True, models='FD001')
test_dataset = CMAPSSDataset(train=False, models='FD001')[15:30]
Create a transformer for a dataset¶
In [5]:
Copied!
from ceruleo.transformation.functional.transformers import Transformer
from ceruleo.transformation.features.selection import ByNameFeatureSelector
from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.dataset.catalog.CMAPSS import sensor_indices
FEATURES = [train_dataset[0].columns[i] for i in sensor_indices]
from ceruleo.transformation.functional.transformers import Transformer
from ceruleo.transformation.features.selection import ByNameFeatureSelector
from ceruleo.transformation.functional.pipeline.pipeline import make_pipeline
from ceruleo.transformation.features.scalers import MinMaxScaler
from ceruleo.dataset.catalog.CMAPSS import sensor_indices
FEATURES = [train_dataset[0].columns[i] for i in sensor_indices]
In [6]:
Copied!
transformer = Transformer(
pipelineX=make_pipeline(
ByNameFeatureSelector(features=FEATURES),
MinMaxScaler(range=(-1, 1))
),
pipelineY=make_pipeline(
ByNameFeatureSelector(features=['RUL']),
)
)
transformer = Transformer(
pipelineX=make_pipeline(
ByNameFeatureSelector(features=FEATURES),
MinMaxScaler(range=(-1, 1))
),
pipelineY=make_pipeline(
ByNameFeatureSelector(features=['RUL']),
)
)
Split train-test-validation¶
In [7]:
Copied!
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
In [8]:
Copied!
train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9)
train_dataset, val_dataset = train_test_split(train_dataset, train_size=0.9)
In [9]:
Copied!
len(train_dataset), len(val_dataset), len(test_dataset)
len(train_dataset), len(val_dataset), len(test_dataset)
Out[9]:
(90, 10, 15)
Models¶
Scikit-learn¶
In [10]:
Copied!
import sklearn.pipeline as sk_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from ceruleo.models.sklearn import EstimatorWrapper, TimeSeriesWindowTransformer, CeruleoRegressor
from sklearn.linear_model import Ridge
import sklearn.pipeline as sk_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from ceruleo.models.sklearn import EstimatorWrapper, TimeSeriesWindowTransformer, CeruleoRegressor
from sklearn.linear_model import Ridge
In [11]:
Copied!
regressor = CeruleoRegressor(
TimeSeriesWindowTransformer(
transformer,
window_size=32,
padding=True,
step=1),
Ridge(alpha=15))
regressor.fit(train_dataset)
regressor = CeruleoRegressor(
TimeSeriesWindowTransformer(
transformer,
window_size=32,
padding=True,
step=1),
Ridge(alpha=15))
regressor.fit(train_dataset)
Out[11]:
CeruleoRegressor(regressor=Ridge(alpha=15), ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>, window_size=32))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CeruleoRegressor(regressor=Ridge(alpha=15), ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>, window_size=32))
Ridge(alpha=15)
Ridge(alpha=15)
TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9eea07f2d0>, window_size=32)
{'features': None, 'pipelineX': [('ByNameFeatureSelector', ['SensorMeasure2', 'SensorMeasure3', 'SensorMeasure4', 'SensorMeasure7', 'SensorMeasure8', 'SensorMeasure9', 'SensorMeasure11', 'SensorMeasure12', 'SensorMeasure13', 'SensorMeasure14', 'SensorMeasure15', 'SensorMeasure17', 'SensorMeasure20', 'SensorMeasure21']), ('MinMaxScaler', {'Min': SensorMeasure2 641.2100 SensorMeasure3 1571.0400 SensorMeasure4 1382.2500 SensorMeasure7 549.8500 SensorMeasure8 2387.9000 SensorMeasure9 9021.7300 SensorMeasure11 46.8500 SensorMeasure12 518.8300 SensorMeasure13 2387.8800 SensorMeasure14 8099.9400 SensorMeasure15 8.3249 SensorMeasure17 388.0000 SensorMeasure20 38.1400 SensorMeasure21 22.8942 dtype: float64, 'Max': SensorMeasure2 644.5300 SensorMeasure3 1616.9100 SensorMeasure4 1441.4900 SensorMeasure7 556.0600 SensorMeasure8 2388.5600 SensorMeasure9 9244.5900 SensorMeasure11 48.5300 SensorMeasure12 523.3800 SensorMeasure13 2388.5600 SensorMeasure14 8293.7200 SensorMeasure15 8.5678 SensorMeasure17 400.0000 SensorMeasure20 39.4300 SensorMeasure21 23.6184 dtype: float64})], 'pipelineY': [('ByNameFeatureSelector', ['RUL'])]}
Pipeline(final_step=MinMaxScaler(name='MinMaxScaler', range=(-1, 1)))
MinMaxScaler
MinMaxScaler
Pipeline(final_step=ByNameFeatureSelector(features=['RUL'], name='ByNameFeatureSelector'))
ByNameFeatureSelector : [RUL]
ByNameFeatureSelector : [RUL]
Val dataset results¶
In [12]:
Copied!
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(val_dataset))
ax.plot(regressor.ts_window_transformer.true_values(val_dataset))
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(val_dataset))
ax.plot(regressor.ts_window_transformer.true_values(val_dataset))
Out[12]:
[<matplotlib.lines.Line2D at 0x7d9ee8207310>]
Test dataset results¶
In [13]:
Copied!
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(test_dataset))
ax.plot(regressor.ts_window_transformer.true_values(test_dataset))
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.predict(test_dataset))
ax.plot(regressor.ts_window_transformer.true_values(test_dataset))
Out[13]:
[<matplotlib.lines.Line2D at 0x7d9ee7f39810>]
Parameters grid search¶
In [14]:
Copied!
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from ceruleo.models.sklearn import CeruleoMetricWrapper
transformer = Transformer(
pipelineX=make_pipeline(
ByNameFeatureSelector(features=FEATURES),
MinMaxScaler(range=(-1, 1))
),
pipelineY=make_pipeline(
ByNameFeatureSelector(features=['RUL']),
)
)
regressor_gs = CeruleoRegressor(
TimeSeriesWindowTransformer(
transformer,
window_size=32,
padding=True,
step=1),
Ridge(alpha=15)
)
grid_search = GridSearchCV(
estimator=regressor_gs,
param_grid={
'ts_window_transformer__window_size': [5, 10],
'regressor': [Ridge(alpha=15), RandomForestRegressor(max_depth=5)]
},
scoring=CeruleoMetricWrapper('neg_mean_absolute_error'),
cv=2
)
grid_search.fit(train_dataset)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from ceruleo.models.sklearn import CeruleoMetricWrapper
transformer = Transformer(
pipelineX=make_pipeline(
ByNameFeatureSelector(features=FEATURES),
MinMaxScaler(range=(-1, 1))
),
pipelineY=make_pipeline(
ByNameFeatureSelector(features=['RUL']),
)
)
regressor_gs = CeruleoRegressor(
TimeSeriesWindowTransformer(
transformer,
window_size=32,
padding=True,
step=1),
Ridge(alpha=15)
)
grid_search = GridSearchCV(
estimator=regressor_gs,
param_grid={
'ts_window_transformer__window_size': [5, 10],
'regressor': [Ridge(alpha=15), RandomForestRegressor(max_depth=5)]
},
scoring=CeruleoMetricWrapper('neg_mean_absolute_error'),
cv=2
)
grid_search.fit(train_dataset)
Out[14]:
GridSearchCV(cv=2, estimator=CeruleoRegressor(regressor=Ridge(alpha=15), ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>, window_size=32)), param_grid={'regressor': [Ridge(alpha=15), RandomForestRegressor(max_depth=5)], 'ts_window_transformer__window_size': [5, 10]}, scoring=<ceruleo.models.sklearn.CeruleoMetricWrapper object at 0x7d9eea3868d0>)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=2, estimator=CeruleoRegressor(regressor=Ridge(alpha=15), ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>, window_size=32)), param_grid={'regressor': [Ridge(alpha=15), RandomForestRegressor(max_depth=5)], 'ts_window_transformer__window_size': [5, 10]}, scoring=<ceruleo.models.sklearn.CeruleoMetricWrapper object at 0x7d9eea3868d0>)
CeruleoRegressor(regressor=Ridge(alpha=15), ts_window_transformer=TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>, window_size=32))
Ridge(alpha=15)
Ridge(alpha=15)
TimeSeriesWindowTransformer(transformer=<ceruleo.transformation.functional.transformers.Transformer object at 0x7d9ee81eac90>, window_size=32)
{'features': None, 'pipelineX': [('ByNameFeatureSelector', []), ('MinMaxScaler', {'Min': None, 'Max': None})], 'pipelineY': [('ByNameFeatureSelector', [])]}
Pipeline(final_step=MinMaxScaler(name='MinMaxScaler', range=(-1, 1)))
MinMaxScaler
MinMaxScaler
Pipeline(final_step=ByNameFeatureSelector(features=['RUL'], name='ByNameFeatureSelector'))
ByNameFeatureSelector : []
ByNameFeatureSelector : []
In [15]:
Copied!
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.ts_window_transformer.true_values(test_dataset), label='True values')
ax.plot(regressor.predict(test_dataset), label='Previous estimator')
ax.plot(grid_search.best_estimator_.predict(test_dataset), label='Best estimator')
ax.legend()
fig, ax = plt.subplots(figsize=(17, 5))
ax.plot(regressor.ts_window_transformer.true_values(test_dataset), label='True values')
ax.plot(regressor.predict(test_dataset), label='Previous estimator')
ax.plot(grid_search.best_estimator_.predict(test_dataset), label='Best estimator')
ax.legend()
Out[15]:
<matplotlib.legend.Legend at 0x7d9ee7f3b190>