Outliers

`BeyondQuartileOutlierRemover`

Bases: TransformerStep

Remove values outside (Q1, Q3)

If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf

Parameters:

Name	Type	Description	Default
`lower_quantile`	`float`	Lower quantile threshold for the non-anomalous values, by default 0.25	`0.25`
`upper_quantile`	`float`	Upper quantile threshold for the non-anomalous values, by default 0.75	`0.75`
`clip`	`bool`	Whether to clip the values outside the range, by default False	`False`
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/outliers.py

class BeyondQuartileOutlierRemover(TransformerStep):
    """
    Remove values outside (Q1, Q3)

    If clip is True the values will be clipped between the range,
    otherwise the values are going to be replaced by inf and -inf

    Parameters:
        lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25
        upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75
        subsample: Proportion of rows sampled by fit to compute the quantiles; it is also
            forwarded to the QuantileEstimator used by partial_fit, by default 1.0
        clip: Whether to clip the values outside the range, by default False
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False
    """

    def __init__(
        self,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        subsample: float = 1.0,
        clip: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit:bool=False
    ):

        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.tdigest_dict = None
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.clip = clip
        self.subsample = subsample
        # Q1/Q3 stay None until fit() runs, or until transform() resolves
        # them from the incremental estimator.
        self.Q1 = None
        self.Q3 = None
        self.quantile_estimator = None

    def partial_fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data incrementally

        Single-row inputs are ignored. Only numeric columns are fed to the
        quantile estimator.

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        if X.shape[0] == 1:
            return self
        if self.quantile_estimator is None:
            self.quantile_estimator = QuantileEstimator(
               tdigest_size=100, subsample=self.subsample
            )

        self.quantile_estimator.update(X.select_dtypes(include="number"))
        return self

    def fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data

        When subsample is below 1 the quantiles are computed on a random
        subset of int(n_rows * subsample) rows drawn without replacement.

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        if self.subsample < 1:

            sampled_points = np.random.choice(
                X.shape[0], int(X.shape[0] * self.subsample), replace=False
            )
            X = X.iloc[sampled_points, :]
        self.Q1 = X.quantile(self.lower_quantile)
        self.Q3 = X.quantile(self.upper_quantile)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame with the outliers removed
        """

        # After partial_fit the quantiles live in the estimator; resolve
        # them once and cache them on the step.
        if self.Q1 is None:
            self.Q1 = self.quantile_estimator.estimate_quantile(self.lower_quantile)
            self.Q3 = self.quantile_estimator.estimate_quantile(self.upper_quantile)

        new_X = X.copy()

        if self.clip:
            new_X.clip(lower=self.Q1, upper=self.Q3, inplace=True, axis=1)
        else:
            new_X[new_X < self.Q1] = -np.inf
            new_X[new_X > self.Q3] = np.inf
        return new_X

    def description(self):
        """
        Describe the step together with the quantiles found for each feature.

        Q1 and Q3 must already be populated (by fit or by a transform call).

        Returns:
            A tuple (name, data) where data is a list of
            (feature, {"Q1": ..., "Q3": ..., "IQR": ...}) entries
        """
        name = super().description()
        data = []
        for k in self.Q1.keys():
            q1 = self.Q1[k]
            q3 = self.Q3[k]
            # This step never stores an IQR attribute, so derive it here.
            data.append((k, {"Q1": q1, "Q3": q3, "IQR": q3 - q1}))
        return (name, data)

`fit(X)`

Compute the quantiles of the data

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def fit(self, X: pd.DataFrame):
    """
    Compute the lower and upper quantiles of the data in one pass.

    When ``subsample`` is below 1 the quantiles are computed on a random
    subset of ``int(n_rows * subsample)`` rows drawn without replacement.

    Parameters:
        X: Input life

    Returns:
        The step itself, with Q1 and Q3 stored on it
    """
    data = X
    if self.subsample < 1:
        n_rows = X.shape[0]
        chosen_rows = np.random.choice(
            n_rows, int(n_rows * self.subsample), replace=False
        )
        data = X.iloc[chosen_rows, :]

    self.Q1, self.Q3 = (
        data.quantile(self.lower_quantile),
        data.quantile(self.upper_quantile),
    )
    return self

`partial_fit(X)`

Compute the quantiles of the data incrementally

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def partial_fit(self, X: pd.DataFrame):
    """
    Update the incremental quantile estimator with a new life.

    Single-row inputs are skipped. The estimator is created on first use
    and only the numeric columns of X are passed to it.

    Parameters:
        X: Input life

    Returns:
        The step itself
    """
    if X.shape[0] != 1:
        if self.quantile_estimator is None:
            self.quantile_estimator = QuantileEstimator(
                tdigest_size=100, subsample=self.subsample
            )
        numeric_features = X.select_dtypes(include="number")
        self.quantile_estimator.update(numeric_features)
    return self

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Replace or clip the values that fall outside (Q1, Q3).

    If the quantiles have not been set yet they are read from the
    incremental quantile estimator and cached on the step.

    Parameters:
        X: Input life

    Returns:
        A copy of X where out-of-range values are clipped to the quantiles
        when clip is True, or replaced by -inf / inf otherwise
    """
    if self.Q1 is None:
        estimator = self.quantile_estimator
        self.Q1 = estimator.estimate_quantile(self.lower_quantile)
        self.Q3 = estimator.estimate_quantile(self.upper_quantile)

    result = X.copy()
    if not self.clip:
        # Mark the low side first; the high-side mask is evaluated afterwards.
        result[result < self.Q1] = -np.inf
        result[result > self.Q3] = np.inf
        return result

    result.clip(lower=self.Q1, upper=self.Q3, inplace=True, axis=1)
    return result

`EWMAOutOfRange`

Bases: TransformerStep

Compute the EWMA limits and mark as NaN points outside UCL and LCL

Parameters:

Name	Type	Description	Default
`lambda_`	`float`	Parameter for the EWMA, by default 0.5	`0.5`
`return_mask`	`bool`	Whether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False	`False`
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/outliers.py

class EWMAOutOfRange(TransformerStep):
    """
    Flag the points that fall outside the EWMA control limits (LCL, UCL).

    The limits are computed per column, ignoring NaN values, as
    mean -/+ 3 * sqrt(lambda_ / (2 - lambda_)) * std.

    Parameters:
        lambda_: Parameter for the EWMA, by default 0.5
        return_mask: Whether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False
    """

    def __init__(
        self,
        *,
        lambda_ : float=0.5,
        return_mask: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit: bool = False,
    ):
        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.lambda_ = lambda_
        self.return_mask = return_mask
        # Learned state, filled in by fit / partial_fit.
        self.columns = None
        self.LCL = None
        self.UCL = None

    def partial_fit(self, X: pd.DataFrame, y=None):
        """
        Update the EWMA limits with a new life.

        Only the columns present in every life seen so far are kept. The
        stored limits are widened to the minimum LCL and maximum UCL seen.

        Parameters:
            X: Input life
            y: Unused

        Returns:
            The step itself
        """
        if self.columns is not None:
            self.columns = [c for c in self.columns if c in X.columns]
        else:
            self.columns = X.columns.values

        if self.LCL is not None:
            # Drop the limits of the columns that are no longer shared.
            self.LCL = self.LCL.loc[self.columns].copy()
            self.UCL = self.UCL.loc[self.columns].copy()

        new_lcl, new_ucl = self._compute_limits(X[self.columns].copy())

        if self.LCL is None:
            self.LCL = new_lcl
        else:
            self.LCL = np.minimum(new_lcl, self.LCL)

        if self.UCL is None:
            self.UCL = new_ucl
        else:
            self.UCL = np.maximum(new_ucl, self.UCL)
        return self

    def _compute_limits(self, X):
        """Return the (LCL, UCL) pair of Series, indexed by self.columns."""
        center = np.nanmean(X, axis=0)
        spread = np.sqrt(self.lambda_ / (2 - self.lambda_)) * np.nanstd(X, axis=0)
        lower_limit = pd.Series(center - 3 * spread, index=self.columns)
        upper_limit = pd.Series(center + 3 * spread, index=self.columns)
        return (lower_limit, upper_limit)

    def fit(self, X: pd.DataFrame, y=None):
        """
        Compute the EWMA limits from a single DataFrame.

        Parameters:
            X: Input life
            y: Unused

        Returns:
            The step itself
        """
        self.columns = X.columns
        self.LCL, self.UCL = self._compute_limits(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Mark the points outside the fitted limits.

        Parameters:
            X: Input life

        Returns:
            An integer mask (1 = outlier) over the fitted columns when
            return_mask is True, otherwise a copy of X with the outliers
            replaced by NaN
        """
        tracked = X[self.columns]
        outliers = (tracked < (self.LCL)) | (tracked > (self.UCL))
        if not self.return_mask:
            cleaned = X.copy()
            cleaned[outliers] = np.nan
            return cleaned
        return outliers.astype("int")

`fit(X, y=None)`

Compute the EWMA limits

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def fit(self, X: pd.DataFrame, y=None):
    """
    Compute the EWMA limits from a single DataFrame.

    The columns of X are remembered and the limits returned by
    ``_compute_limits`` are stored as LCL and UCL.

    Parameters:
        X: Input life
        y: Unused

    Returns:
        The step itself
    """
    self.columns = X.columns
    self.LCL, self.UCL = self._compute_limits(X)
    return self

`partial_fit(X, y=None)`

Compute the EWMA limits incrementally

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def partial_fit(self, X: pd.DataFrame, y=None):
    """
    Update the EWMA limits with a new life.

    Only the columns present in every life seen so far are kept. The
    stored limits are widened to the minimum LCL and maximum UCL seen.

    Parameters:
        X: Input life
        y: Unused

    Returns:
        The step itself
    """
    if self.columns is not None:
        self.columns = [c for c in self.columns if c in X.columns]
    else:
        self.columns = X.columns.values

    if self.LCL is not None:
        # Drop the limits of the columns that are no longer shared.
        self.LCL = self.LCL.loc[self.columns].copy()
        self.UCL = self.UCL.loc[self.columns].copy()

    new_lcl, new_ucl = self._compute_limits(X[self.columns].copy())

    if self.LCL is None:
        self.LCL = new_lcl
    else:
        self.LCL = np.minimum(new_lcl, self.LCL)

    if self.UCL is None:
        self.UCL = new_ucl
    else:
        self.UCL = np.maximum(new_ucl, self.UCL)
    return self

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Mark the points that fall outside the fitted limits.

    Parameters:
        X: Input life

    Returns:
        An integer mask (1 = outlier) over the fitted columns when
        return_mask is True, otherwise a copy of X with the outliers
        replaced by NaN
    """
    tracked = X[self.columns]
    outliers = (tracked < (self.LCL)) | (tracked > (self.UCL))
    if not self.return_mask:
        cleaned = X.copy()
        cleaned[outliers] = np.nan
        return cleaned
    return outliers.astype("int")

`IQROutlierRemover`

Bases: TransformerStep

Remove values outside (Q1 - margin * IQR, Q3 + margin * IQR)

If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf

Parameters:

Name	Type	Description	Default
`lower_quantile`	`float`	Lower quantile threshold for the non-anomalous values, by default 0.25	`0.25`
`upper_quantile`	`float`	Upper quantile threshold for the non-anomalous values, by default 0.75	`0.75`
`margin`	`float`	How many times the IQR gets multiplied, by default 1.5	`1.5`
`proportion_to_sample`	`float`	If you want to compute the quantiles on a smaller proportion of the data you can specify it, by default 1.0	`1.0`
`clip`	`bool`	Whether to clip the values outside the range, by default False	`False`
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/outliers.py

class IQROutlierRemover(TransformerStep):
    """
    Remove values outside (Q1 - margin*IQR, Q3 + margin*IQR)

    If clip is True the values will be clipped between the range,
    otherwise the values are going to be replaced by inf and -inf

    Parameters:
        lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25
        upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75
        margin: How many times the IQR gets multiplied, by default 1.5
        proportion_to_sample: Proportion of the rows used to compute the quantiles,
            by default 1.0
        clip: Whether to clip the values outside the range, by default False
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False

    """

    def __init__(
        self,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        margin: float=1.5,
        proportion_to_sample: float=1.0,
        clip: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit: bool = False,
    ):

        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.margin = margin
        self.proportion_to_sample = proportion_to_sample
        self.clip = clip
        # One digest per column, created lazily by partial_fit.
        self.tdigest_dict = None

    def partial_fit(self, X: pd.DataFrame):
        """
        Update the quantiles and the interquartile range incrementally.

        Single-row inputs are skipped. When proportion_to_sample is below 1
        only a random subset of the rows is merged into the digests.

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        if X.shape[0] == 1:
            return self

        if self.proportion_to_sample < 1:
            n_rows = X.shape[0]
            chosen_rows = np.random.choice(
                n_rows, int(n_rows * self.proportion_to_sample), replace=False
            )
            X = X.iloc[chosen_rows, :]

        if self.tdigest_dict is None:
            self.tdigest_dict = {column: TDigest(100) for column in X.columns}
        for column in X.columns:
            digest = self.tdigest_dict[column]
            self.tdigest_dict[column] = digest.merge_unsorted(X[column].values)

        digests = self.tdigest_dict
        self.Q1 = {
            column: digest.estimate_quantile(self.lower_quantile)
            for column, digest in digests.items()
        }
        self.Q3 = {
            column: digest.estimate_quantile(self.upper_quantile)
            for column, digest in digests.items()
        }
        self.IQR = {column: self.Q3[column] - q1 for column, q1 in self.Q1.items()}
        return self

    def fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data and the interquartile range.

        When proportion_to_sample is below 1 they are computed on a random
        subset of the rows drawn without replacement.

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        data = X
        if self.proportion_to_sample < 1:
            n_rows = X.shape[0]
            chosen_rows = np.random.choice(
                n_rows, int(n_rows * self.proportion_to_sample), replace=False
            )
            data = X.iloc[chosen_rows, :]

        lower = data.quantile(self.lower_quantile)
        upper = data.quantile(self.upper_quantile)
        self.IQR = (upper - lower).to_dict()
        self.Q1 = lower.to_dict()
        self.Q3 = upper.to_dict()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A copy of X where out-of-range values are clipped to the bounds
            when clip is True, or replaced by -inf / inf otherwise
        """
        X = X.copy()
        for fitted_attribute in ("Q1", "Q3", "IQR"):
            check_is_fitted(self, fitted_attribute)

        for column in X.columns:
            reach = self.margin * self.IQR[column]

            lower_bound = self.Q1[column] - reach
            too_low = X[column] < lower_bound
            X.loc[too_low, column] = lower_bound if self.clip else -np.inf

            # The upper mask is evaluated after the lower side was handled.
            upper_bound = self.Q3[column] + reach
            too_high = X[column] > (upper_bound)
            X.loc[too_high, column] = upper_bound if self.clip else np.inf
        return X

    def description(self):
        """
        Describe the step together with Q1, Q3 and IQR of each feature.

        Returns:
            A tuple (name, data) where data is a list of
            (feature, {"Q1": ..., "Q3": ..., "IQR": ...}) entries
        """
        name = super().description()
        data = [
            (k, {"Q1": self.Q1[k], "Q3": self.Q3[k], "IQR": self.IQR[k]})
            for k in self.Q1.keys()
        ]
        return (name, data)

`fit(X)`

Compute the quantiles of the data and the interquartile range

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def fit(self, X: pd.DataFrame):
    """
    Compute the quantiles of the data and the interquartile range.

    When ``proportion_to_sample`` is below 1 they are computed on a random
    subset of the rows drawn without replacement. Q1, Q3 and IQR are stored
    as dictionaries keyed by column.

    Parameters:
        X: Input life

    Returns:
        The step itself
    """
    data = X
    if self.proportion_to_sample < 1:
        n_rows = X.shape[0]
        chosen_rows = np.random.choice(
            n_rows, int(n_rows * self.proportion_to_sample), replace=False
        )
        data = X.iloc[chosen_rows, :]

    lower = data.quantile(self.lower_quantile)
    upper = data.quantile(self.upper_quantile)
    self.IQR = (upper - lower).to_dict()
    self.Q1 = lower.to_dict()
    self.Q3 = upper.to_dict()
    return self

`partial_fit(X)`

Compute the quantiles of the data and the interquartile range incrementally

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def partial_fit(self, X: pd.DataFrame):
    """
    Update the quantiles and the interquartile range incrementally.

    Single-row inputs are skipped. When ``proportion_to_sample`` is below 1
    only a random subset of the rows is merged into the per-column digests.
    Q1, Q3 and IQR are re-estimated from the digests after every call.

    Parameters:
        X: Input life

    Returns:
        The step itself
    """
    if X.shape[0] == 1:
        return self

    if self.proportion_to_sample < 1:
        n_rows = X.shape[0]
        chosen_rows = np.random.choice(
            n_rows, int(n_rows * self.proportion_to_sample), replace=False
        )
        X = X.iloc[chosen_rows, :]

    if self.tdigest_dict is None:
        self.tdigest_dict = {column: TDigest(100) for column in X.columns}
    for column in X.columns:
        digest = self.tdigest_dict[column]
        self.tdigest_dict[column] = digest.merge_unsorted(X[column].values)

    digests = self.tdigest_dict
    self.Q1 = {
        column: digest.estimate_quantile(self.lower_quantile)
        for column, digest in digests.items()
    }
    self.Q3 = {
        column: digest.estimate_quantile(self.upper_quantile)
        for column, digest in digests.items()
    }
    self.IQR = {column: self.Q3[column] - q1 for column, q1 in self.Q1.items()}
    return self

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the outliers from the input life.

    For every column the bounds are Q1 - margin * IQR and Q3 + margin * IQR.

    Parameters:
        X: Input life

    Returns:
        A copy of X where out-of-range values are clipped to the bounds
        when clip is True, or replaced by -inf / inf otherwise
    """
    X = X.copy()
    for fitted_attribute in ("Q1", "Q3", "IQR"):
        check_is_fitted(self, fitted_attribute)

    for column in X.columns:
        reach = self.margin * self.IQR[column]

        lower_bound = self.Q1[column] - reach
        too_low = X[column] < lower_bound
        X.loc[too_low, column] = lower_bound if self.clip else -np.inf

        # The upper mask is evaluated after the lower side was handled.
        upper_bound = self.Q3[column] + reach
        too_high = X[column] > (upper_bound)
        X.loc[too_high, column] = upper_bound if self.clip else np.inf
    return X

`IsolationForestOutlierRemover`

Bases: TransformerStep

Remove outliers using Isolation Forests to detect them.

Parameters:

Name	Type	Description	Default
`n_estimators`		Number of trees in the forest, by default 100	`100`
`name`		Name of the step, by default None	required

Source code in ceruleo/transformation/features/outliers.py

class IsolationForestOutlierRemover(TransformerStep):
    """
    Remove outliers using Isolation Forests to detect them.

    One forest is fitted per column; the points a forest labels as -1 are
    replaced by NaN.

    Parameters:
        n_estimators: Number of trees in the forest, by default 100
        name: Name of the step, by default None
    """
    def __init__(self, *, n_estimators=100, **kwargs):
        super().__init__(prefer_partial_fit=False, **kwargs)
        # Column name -> fitted IsolationForest.
        self.forests = {}
        self.n_estimators = n_estimators

    def fit(self, X: pd.DataFrame):
        """
        Fit one Isolation Forest per column of the data.

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        for column in X.columns:
            forest = IsolationForest(n_estimators=self.n_estimators)
            samples = X[column].values.reshape(-1, 1)
            self.forests[column] = forest.fit(samples)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame with the outliers replaced by NaN
        """
        cleaned = X.copy()
        for column in X.columns:
            samples = X[column].values.reshape(-1, 1)
            labels = self.forests[column].predict(samples)
            cleaned.loc[labels == -1, column] = np.nan
        return cleaned

`fit(X)`

Fit the Isolation Forest model to the data

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def fit(self, X: pd.DataFrame):
    """
    Fit one Isolation Forest per column of the data.

    Each fitted forest is stored in ``self.forests`` under its column name.

    Parameters:
        X: Input life

    Returns:
        The step itself
    """
    for column in X.columns:
        forest = IsolationForest(n_estimators=self.n_estimators)
        samples = X[column].values.reshape(-1, 1)
        self.forests[column] = forest.fit(samples)
    return self

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the outliers from the input life.

    Every column is scored by its own fitted forest; the points labelled
    -1 are replaced by NaN.

    Parameters:
        X: Input life

    Returns:
        A new DataFrame with the outliers replaced by NaN
    """
    cleaned = X.copy()
    for column in X.columns:
        samples = X[column].values.reshape(-1, 1)
        labels = self.forests[column].predict(samples)
        cleaned.loc[labels == -1, column] = np.nan
    return cleaned

`RollingMeanOutlierRemover`

Bases: TransformerStep

Compute the rolling mean and use it to compute the upper and lower bound to define outliers

Parameters:

Name	Type	Description	Default
`window`	`int`	Window for the rolling mean, by default 15	`15`
`lambda_`	`float`	Multiplier of the std used to define the bounds, by default 3	`3`
`return_mask`	`bool`	Whether to return a mask with the outliers or the original data with the outliers clipped to the bounds, by default False	`False`
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/outliers.py

class RollingMeanOutlierRemover(TransformerStep):
    """
    Detect outliers with rolling bounds around the rolling median.

    For every point the bounds are
    rolling median -/+ lambda_ * (rolling Q3 - rolling Q1),
    computed over a window of `window` points (shorter at the beginning).
    Outliers are either clipped to the bounds or reported through a mask.

    Parameters:
        window: Window for the rolling statistics, by default 15
        lambda_: Multiplier of the rolling interquartile range used to define the bounds, by default 3
        return_mask: Whether to return a mask with the outliers or the data with the outliers clipped to the bounds, by default False
        name: Name of the step, by default None
    """

    def __init__(
        self,
        *,
        window: int = 15,
        lambda_: float = 3,
        return_mask: bool = False,
        name: Optional[str] = None,
    ):
        super().__init__(name=name)
        self.window = window
        self.lambda_ = lambda_
        self.return_mask = return_mask

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            An integer mask (1 = outlier) when return_mask is True,
            otherwise a new DataFrame with the outliers clipped to the
            rolling bounds
        """
        rolling = X.rolling(self.window, min_periods=1)
        center = rolling.median()
        spread = rolling.quantile(0.75) - rolling.quantile(0.25)
        upper = center + (self.lambda_ * spread)
        lower = center - (self.lambda_ * spread)

        above = X > upper
        below = X < lower
        if self.return_mask:
            return (above | below).astype("int")

        # Values below the LOWER bound are raised to it; the previous code
        # selected them with `X < upper`, which only worked by accident.
        return X.mask(above, upper).mask(below, lower)

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the outliers from the input life.

    For every point the bounds are
    rolling median -/+ lambda_ * (rolling Q3 - rolling Q1),
    computed over a window of ``window`` points (shorter at the beginning).

    Parameters:
        X: Input life

    Returns:
        An integer mask (1 = outlier) when return_mask is True, otherwise
        a new DataFrame with the outliers clipped to the rolling bounds
    """
    rolling = X.rolling(self.window, min_periods=1)
    center = rolling.median()
    spread = rolling.quantile(0.75) - rolling.quantile(0.25)
    upper = center + (self.lambda_ * spread)
    lower = center - (self.lambda_ * spread)

    above = X > upper
    below = X < lower
    if self.return_mask:
        return (above | below).astype("int")

    # Values below the LOWER bound are raised to it; the previous code
    # selected them with `X < upper`, which only worked by accident.
    return X.mask(above, upper).mask(below, lower)

`ZScoreOutlierRemover`

Bases: TransformerStep

Remove values outside (mean - number_of_std_allowed * std, mean + number_of_std_allowed * std). The outliers are set to NaN

Parameters:

Name	Type	Description	Default
`number_of_std_allowed`		Number of standard deviations to consider a point an outlier	required
`name`	`str`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/outliers.py

class ZScoreOutlierRemover(TransformerStep):
    """
    Remove values outside (mean - number_of_std_allowed*std, mean + number_of_std_allowed*std). The outliers are set to NaN

    The data is standardized with a StandardScaler; the returned DataFrame
    holds the standardized values, not the original ones.

    Parameters:
        number_of_std_allowed: Number of standard deviations to consider a point an outlier
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False
    """

    def __init__(
        self,
        *,
        number_of_std_allowed,
        name: str = None,
        prefer_partial_fit: bool = False,
    ):
        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.scaler = StandardScaler()
        self.number_of_std_allowed = number_of_std_allowed

    def fit(self, X: pd.DataFrame):
        """
        Fit the StandardScaler to the data

        Parameters:
            X: Input life

        Returns:
            The step itself
        """
        scaler = self.scaler
        scaler.fit(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Standardize the input life and blank out the outliers.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame, with the columns and index of X, holding the
            scaled values; entries whose absolute value exceeds
            number_of_std_allowed are NaN
        """
        scaled = self.scaler.transform(X)
        outliers = np.abs(scaled) > self.number_of_std_allowed
        scaled[outliers] = np.nan
        return pd.DataFrame(scaled, columns=X.columns, index=X.index)

`fit(X)`

Fit a StandardScaler to the data

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Source code in ceruleo/transformation/features/outliers.py

def fit(self, X: pd.DataFrame):
    """
    Fit the scaler held by the step to the data.

    Parameters:
        X: Input life

    Returns:
        The step itself
    """
    scaler = self.scaler
    scaler.fit(X)
    return self

`transform(X)`

Remove the outliers from the input life.

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	Input life	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Scale the input life and blank out the outliers.

    The values returned are the ones produced by ``self.scaler.transform``,
    not the original ones.

    Parameters:
        X: Input life

    Returns:
        A new DataFrame, with the columns and index of X, holding the
        scaled values; entries whose absolute value exceeds
        number_of_std_allowed are NaN
    """
    scaled = self.scaler.transform(X)
    outliers = np.abs(scaled) > self.number_of_std_allowed
    scaled[outliers] = np.nan
    return pd.DataFrame(scaled, columns=X.columns, index=X.index)