Skip to content

Outliers

Outliers

BeyondQuartileOutlierRemover

Bases: TransformerStep

Remove values outside (Q1, Q3)

If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf

Parameters:

Name Type Description Default
lower_quantile float

Lower quantile threshold for the non-anomalous values, by default 0.25

0.25
upper_quantile float

Upper quantile threshold for the non-anomalous values, by default 0.75

0.75
clip bool

Whether to clip the values outside the range, by default False

False
name Optional[str]

Name of the step, by default None

None
Source code in ceruleo/transformation/features/outliers.py
class BeyondQuartileOutlierRemover(TransformerStep):
    """
    Remove values outside the range [Q1, Q3]

    If clip is True the values will be clipped between the range,
    otherwise the values are going to be replaced by inf and -inf

    Parameters:
        lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25
        upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75
        subsample: Proportion of rows sampled (without replacement) in fit, and value
            forwarded to the QuantileEstimator in partial_fit, by default 1.0
        clip: Whether to clip the values outside the range, by default False
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False
    """

    def __init__(
        self,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        subsample: float = 1.0,
        clip: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit: bool = False,
    ):

        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.tdigest_dict = None
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.clip = clip
        self.subsample = subsample
        # Q1 / Q3 stay None until fit() runs, or until the first transform()
        # pulls them from the incremental estimator.
        self.Q1 = None
        self.Q3 = None
        self.quantile_estimator = None

    def partial_fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data incrementally

        Lives with a single row are ignored. Only the numeric columns
        are fed to the quantile estimator.

        Parameters:
            X: Input life
        """
        if X.shape[0] == 1:
            return self
        if self.quantile_estimator is None:
            self.quantile_estimator = QuantileEstimator(
                tdigest_size=100, subsample=self.subsample
            )

        self.quantile_estimator.update(X.select_dtypes(include="number"))
        return self

    def fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data

        When subsample is lower than 1 the quantiles are computed on a random
        selection of rows drawn without replacement.

        Parameters:
            X: Input life
        """
        if self.subsample < 1:

            sampled_points = np.random.choice(
                X.shape[0], int(X.shape[0] * self.subsample), replace=False
            )
            X = X.iloc[sampled_points, :]
        self.Q1 = X.quantile(self.lower_quantile)
        self.Q3 = X.quantile(self.upper_quantile)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame with the outliers removed
        """

        # After partial_fit the quantiles only live inside the estimator;
        # materialize them once and cache them on the step.
        if self.Q1 is None:
            self.Q1 = self.quantile_estimator.estimate_quantile(self.lower_quantile)
            self.Q3 = self.quantile_estimator.estimate_quantile(self.upper_quantile)

        new_X = X.copy()

        if self.clip:
            new_X.clip(lower=self.Q1, upper=self.Q3, inplace=True, axis=1)
        else:
            new_X[new_X < self.Q1] = -np.inf
            new_X[new_X > self.Q3] = np.inf
        return new_X

    def description(self):
        """
        Describe the step

        Returns:
            A tuple (name, data) where data holds, for each column,
            the fitted Q1, Q3 and their difference (IQR)
        """
        name = super().description()
        data = []
        for k in self.Q1.keys():
            q1 = self.Q1[k]
            q3 = self.Q3[k]
            # This step never stores an IQR attribute, so derive it here
            # instead of reading the non-existent self.IQR.
            data.append((k, {"Q1": q1, "Q3": q3, "IQR": q3 - q1}))
        return (name, data)

fit(X)

Compute the quantiles of the data

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def fit(self, X: pd.DataFrame):
    """
    Estimate the lower and upper quantile of every column

    When subsample is lower than 1, the quantiles are computed on a random
    selection of rows drawn without replacement.

    Parameters:
        X: Input life
    """
    data = X
    if self.subsample < 1:
        n_rows = X.shape[0]
        n_kept = int(n_rows * self.subsample)
        kept_rows = np.random.choice(n_rows, n_kept, replace=False)
        data = X.iloc[kept_rows, :]

    self.Q1 = data.quantile(self.lower_quantile)
    self.Q3 = data.quantile(self.upper_quantile)
    return self

partial_fit(X)

Compute the quantiles of the data incrementally

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def partial_fit(self, X: pd.DataFrame):
    """
    Feed one more life to the incremental quantile estimator

    Lives with a single row are skipped. The estimator is created on the
    first usable life and only receives the numeric columns.

    Parameters:
        X: Input life
    """
    has_single_row = X.shape[0] == 1
    if not has_single_row:
        if self.quantile_estimator is None:
            self.quantile_estimator = QuantileEstimator(
                tdigest_size=100, subsample=self.subsample
            )
        numeric_columns = X.select_dtypes(include="number")
        self.quantile_estimator.update(numeric_columns)
    return self

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Replace or clip the values that fall outside the fitted quantiles.

    If the quantiles were not computed by fit they are obtained from the
    incremental quantile estimator and cached on the step.

    Parameters:
        X: Input life

    Returns:
        A new DataFrame with the outliers removed
    """
    quantiles_missing = self.Q1 is None
    if quantiles_missing:
        estimator = self.quantile_estimator
        self.Q1 = estimator.estimate_quantile(self.lower_quantile)
        self.Q3 = estimator.estimate_quantile(self.upper_quantile)

    cleaned = X.copy()
    if not self.clip:
        # Mark the outliers with signed infinities
        too_low = cleaned < self.Q1
        cleaned[too_low] = -np.inf
        too_high = cleaned > self.Q3
        cleaned[too_high] = np.inf
        return cleaned

    cleaned.clip(lower=self.Q1, upper=self.Q3, inplace=True, axis=1)
    return cleaned

EWMAOutOfRange

Bases: TransformerStep

Compute the EWMA limits and mark as NaN points outside UCL and LCL

Parameters:

Name Type Description Default
lambda_ float

Parameter for the EWMA, by default 0.5

0.5
return_mask bool

Whether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False

False
name Optional[str]

Name of the step, by default None

None
Source code in ceruleo/transformation/features/outliers.py
class EWMAOutOfRange(TransformerStep):
    """
    Mark as NaN the points outside the EWMA control limits (LCL, UCL)

    The limits are computed per column as
    mean -/+ 3 * sqrt(lambda_ / (2 - lambda_)) * std, ignoring NaN values.

    Parameters:
        lambda_: Parameter for the EWMA, by default 0.5
        return_mask: Whether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False
        name: Name of the step, by default None
    """

    def __init__(
        self,
        *,
        lambda_: float = 0.5,
        return_mask: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit: bool = False,
    ):
        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.lambda_ = lambda_
        self.return_mask = return_mask
        # Fitted state: column names and the per-column control limits
        self.columns = None
        self.LCL = None
        self.UCL = None

    def partial_fit(self, X: pd.DataFrame, y=None):
        """
        Update the EWMA limits with a new life

        Only the columns present in every life seen so far are kept. The
        stored limits are widened to the minimum LCL and maximum UCL.

        Parameters:
            X: Input life
        """
        known_columns = self.columns
        if known_columns is None:
            self.columns = X.columns.values
        else:
            self.columns = [name for name in known_columns if name in X.columns]

        # Restrict the previous limits to the surviving columns
        if self.LCL is not None:
            self.LCL = self.LCL.loc[self.columns].copy()
            self.UCL = self.UCL.loc[self.columns].copy()

        life_lcl, life_ucl = self._compute_limits(X[self.columns].copy())

        if self.LCL is None:
            self.LCL = life_lcl
        else:
            self.LCL = np.minimum(life_lcl, self.LCL)

        if self.UCL is None:
            self.UCL = life_ucl
        else:
            self.UCL = np.maximum(life_ucl, self.UCL)
        return self

    def _compute_limits(self, X):
        # Returns (LCL, UCL) as Series indexed by self.columns
        center = np.nanmean(X, axis=0)
        factor = np.sqrt(self.lambda_ / (2 - self.lambda_))
        s = factor * np.nanstd(X, axis=0)
        lower_limit = pd.Series(center - 3 * s, index=self.columns)
        upper_limit = pd.Series(center + 3 * s, index=self.columns)
        return (lower_limit, upper_limit)

    def fit(self, X: pd.DataFrame, y=None):
        """
        Compute the EWMA limits from a single DataFrame

        Parameters:
            X: Input life
        """
        self.columns = X.columns
        self.LCL, self.UCL = self._compute_limits(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            An integer mask of the outliers if return_mask is True,
            otherwise a new DataFrame with the outliers set to NaN
        """
        fitted = X[self.columns]
        below = fitted < (self.LCL)
        above = fitted > (self.UCL)
        mask = below | above
        if self.return_mask:
            return mask.astype("int")

        X = X.copy()
        X[mask] = np.nan
        return X

fit(X, y=None)

Compute the EWMA limits

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def fit(self, X: pd.DataFrame, y=None):
    """
    Compute the EWMA limits from a single DataFrame

    The column names are stored first because the limit computation
    uses them to index its result.

    Parameters:
        X: Input life
    """
    self.columns = X.columns
    self.LCL, self.UCL = self._compute_limits(X)
    return self

partial_fit(X, y=None)

Compute the EWMA limits incrementally

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def partial_fit(self, X: pd.DataFrame, y=None):
    """
    Update the EWMA limits with a new life

    Only the columns present in every life seen so far are kept. The
    stored limits are widened to the minimum LCL and maximum UCL.

    Parameters:
        X: Input life
    """
    known_columns = self.columns
    if known_columns is None:
        self.columns = X.columns.values
    else:
        self.columns = [name for name in known_columns if name in X.columns]

    # Restrict the previous limits to the surviving columns
    if self.LCL is not None:
        self.LCL = self.LCL.loc[self.columns].copy()
        self.UCL = self.UCL.loc[self.columns].copy()

    life_lcl, life_ucl = self._compute_limits(X[self.columns].copy())

    if self.LCL is None:
        self.LCL = life_lcl
    else:
        self.LCL = np.minimum(life_lcl, self.LCL)

    if self.UCL is None:
        self.UCL = life_ucl
    else:
        self.UCL = np.maximum(life_ucl, self.UCL)
    return self

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Flag the values that fall outside the fitted control limits.

    Parameters:
        X: Input life

    Returns:
        An integer mask of the outliers if return_mask is True,
        otherwise a new DataFrame with the outliers set to NaN
    """
    fitted = X[self.columns]
    below = fitted < (self.LCL)
    above = fitted > (self.UCL)
    mask = below | above
    if self.return_mask:
        return mask.astype("int")

    X = X.copy()
    X[mask] = np.nan
    return X

IQROutlierRemover

Bases: TransformerStep

Remove values outside (Q1 - margin*IQR, Q3 + margin*IQR)

If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf

Parameters:

Name Type Description Default
lower_quantile float

Lower quantile threshold for the non-anomalous values, by default 0.25

0.25
upper_quantile float

Upper quantile threshold for the non-anomalous values, by default 0.75

0.75
margin float

How many times the IQR gets multiplied, by default 1.5

1.5
proportion_to_sample float

If you want to compute the quantiles on a smaller proportion of the data you can specify it, by default 1.0

1.0
clip bool

Whether to clip the values outside the range, by default False

False
name Optional[str]

Name of the step, by default None

None
Source code in ceruleo/transformation/features/outliers.py
class IQROutlierRemover(TransformerStep):
    """
    Remove values outside (Q1 - margin*IQR, Q3 + margin*IQR)

    If clip is True the values will be clipped between the range,
    otherwise the values are going to be replaced by inf and -inf

    Parameters:
        lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25
        upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75
        margin: How many times the IQR gets multiplied, by default 1.5
        proportion_to_sample: If you want to compute the quantiles on a smaller proportion of the data
            you can specify it, by default 1.0
        clip: Whether to clip the values outside the range, by default False
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False

    """

    def __init__(
        self,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        margin: float = 1.5,
        proportion_to_sample: float = 1.0,
        clip: bool = False,
        name: Optional[str] = None,
        prefer_partial_fit: bool = False,
    ):

        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.margin = margin
        self.proportion_to_sample = proportion_to_sample
        self.tdigest_dict = None
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        self.clip = clip
        # Q1, Q3 and IQR are deliberately NOT initialized here: transform()
        # relies on their absence (check_is_fitted) to detect an unfitted step.

    def partial_fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data and the interquartile range incrementally

        Lives with a single row are ignored. One digest per column is kept
        and updated with every life; Q1, Q3 and IQR are refreshed after each call.

        Parameters:
            X: Input life
        """
        if X.shape[0] == 1:
            return self
        if self.proportion_to_sample < 1:
            sampled_points = np.random.choice(
                X.shape[0], int(X.shape[0] * self.proportion_to_sample), replace=False
            )
            X = X.iloc[sampled_points, :]
        if self.tdigest_dict is None:
            self.tdigest_dict = {}
        for c in X.columns:
            # Create the digest lazily so that a column first appearing in a
            # later life does not raise a KeyError.
            if c not in self.tdigest_dict:
                self.tdigest_dict[c] = TDigest(100)
            self.tdigest_dict[c] = self.tdigest_dict[c].merge_unsorted(X[c].values)

        self.Q1 = {
            c: digest.estimate_quantile(self.lower_quantile)
            for c, digest in self.tdigest_dict.items()
        }

        self.Q3 = {
            c: digest.estimate_quantile(self.upper_quantile)
            for c, digest in self.tdigest_dict.items()
        }

        self.IQR = {c: self.Q3[c] - self.Q1[c] for c in self.Q1}
        return self

    def fit(self, X: pd.DataFrame):
        """
        Compute the quantiles of the data and the interquartile range

        When proportion_to_sample is lower than 1 the quantiles are computed
        on a random selection of rows drawn without replacement.

        Parameters:
            X: Input life
        """
        if self.proportion_to_sample < 1:
            sampled_points = np.random.choice(
                X.shape[0], int(X.shape[0] * self.proportion_to_sample), replace=False
            )
            X = X.iloc[sampled_points, :]
        q1 = X.quantile(self.lower_quantile)
        q3 = X.quantile(self.upper_quantile)
        # Stored as plain dicts, same layout partial_fit produces
        self.IQR = (q3 - q1).to_dict()
        self.Q1 = q1.to_dict()
        self.Q3 = q3.to_dict()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame with the outliers removed
        """
        X = X.copy()
        check_is_fitted(self, "Q1")
        check_is_fitted(self, "Q3")
        check_is_fitted(self, "IQR")
        for c in X.columns:
            min_value = self.Q1[c] - self.margin * self.IQR[c]
            mask = X[c] < min_value
            if not self.clip:
                X.loc[mask, c] = -np.inf
            else:
                X.loc[mask, c] = min_value
            max_value = self.Q3[c] + self.margin * self.IQR[c]
            mask = X[c] > (max_value)
            if not self.clip:
                X.loc[mask, c] = np.inf
            else:
                X.loc[mask, c] = max_value
        return X

    def description(self):
        """
        Describe the step

        Returns:
            A tuple (name, data) where data holds, for each column,
            the fitted Q1, Q3 and IQR
        """
        name = super().description()
        data = []
        for k in self.Q1.keys():
            data.append((k, {"Q1": self.Q1[k], "Q3": self.Q3[k], "IQR": self.IQR[k]}))
        return (name, data)

fit(X)

Compute the quantiles of the data and the interquartile range

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def fit(self, X: pd.DataFrame):
    """
    Compute the quantiles of the data and the interquartile range

    When proportion_to_sample is lower than 1 the quantiles are computed
    on a random selection of rows drawn without replacement. The results
    are stored as dictionaries keyed by column name.

    Parameters:
        X: Input life
    """
    data = X
    if self.proportion_to_sample < 1:
        n_rows = X.shape[0]
        n_kept = int(n_rows * self.proportion_to_sample)
        kept_rows = np.random.choice(n_rows, n_kept, replace=False)
        data = X.iloc[kept_rows, :]

    lower = data.quantile(self.lower_quantile)
    upper = data.quantile(self.upper_quantile)
    self.IQR = (upper - lower).to_dict()
    self.Q1 = lower.to_dict()
    self.Q3 = upper.to_dict()
    return self

partial_fit(X)

Compute the quantiles of the data and the interquartile range incrementally

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def partial_fit(self, X: pd.DataFrame):
    """
    Compute the quantiles of the data and the interquartile range incrementally

    Lives with a single row are ignored. One digest per column is kept
    and updated with every life; Q1, Q3 and IQR are refreshed after each call.

    Parameters:
        X: Input life
    """
    if X.shape[0] == 1:
        return self
    if self.proportion_to_sample < 1:
        sampled_points = np.random.choice(
            X.shape[0], int(X.shape[0] * self.proportion_to_sample), replace=False
        )
        X = X.iloc[sampled_points, :]
    if self.tdigest_dict is None:
        self.tdigest_dict = {}
    for c in X.columns:
        # Create the digest lazily so that a column first appearing in a
        # later life does not raise a KeyError.
        if c not in self.tdigest_dict:
            self.tdigest_dict[c] = TDigest(100)
        self.tdigest_dict[c] = self.tdigest_dict[c].merge_unsorted(X[c].values)

    self.Q1 = {
        c: digest.estimate_quantile(self.lower_quantile)
        for c, digest in self.tdigest_dict.items()
    }

    self.Q3 = {
        c: digest.estimate_quantile(self.upper_quantile)
        for c, digest in self.tdigest_dict.items()
    }

    self.IQR = {c: self.Q3[c] - self.Q1[c] for c in self.Q1}
    return self

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Replace or clip the values outside (Q1 - margin*IQR, Q3 + margin*IQR).

    If clip is True the outliers take the value of the violated bound,
    otherwise they are replaced by -inf / inf.

    Parameters:
        X: Input life

    Returns:
        A new DataFrame with the outliers removed
    """
    X = X.copy()
    for fitted_attribute in ("Q1", "Q3", "IQR"):
        check_is_fitted(self, fitted_attribute)

    for c in X.columns:
        lower_bound = self.Q1[c] - self.margin * self.IQR[c]
        too_low = X[c] < lower_bound
        X.loc[too_low, c] = lower_bound if self.clip else -np.inf

        upper_bound = self.Q3[c] + self.margin * self.IQR[c]
        too_high = X[c] > (upper_bound)
        X.loc[too_high, c] = upper_bound if self.clip else np.inf
    return X

IsolationForestOutlierRemover

Bases: TransformerStep

Remove outliers using Isolation Forests to detect them.

Parameters:

Name Type Description Default
n_estimators

Number of trees in the forest, by default 100

100
name

Name of the step, by default None

required
Source code in ceruleo/transformation/features/outliers.py
class IsolationForestOutlierRemover(TransformerStep):
    """
    Detect outliers with one Isolation Forest per column and set them to NaN.

    Parameters:
        n_estimators: Number of trees in the forest, by default 100
        name: Name of the step, by default None
    """

    def __init__(self, *, n_estimators=100, **kwargs):
        super().__init__(prefer_partial_fit=False, **kwargs)
        # Maps a column name to the forest fitted on that column
        self.forests = {}
        self.n_estimators = n_estimators

    def fit(self, X: pd.DataFrame):
        """
        Fit one Isolation Forest per column of the data

        Parameters:
            X: Input life
        """
        for column in X.columns:
            # Each column is treated as a single-feature dataset
            samples = X[column].values.reshape(-1, 1)
            forest = IsolationForest(n_estimators=self.n_estimators)
            self.forests[column] = forest.fit(samples)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new DataFrame with the outliers removed
        """
        cleaned = X.copy()
        for column in X.columns:
            samples = X[column].values.reshape(-1, 1)
            labels = self.forests[column].predict(samples)
            # The forest labels outliers with -1
            is_outlier = labels == -1
            cleaned.loc[is_outlier, column] = np.nan
        return cleaned

fit(X)

Fit the Isolation Forest model to the data

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def fit(self, X: pd.DataFrame):
    """
    Fit one Isolation Forest per column of the data

    Parameters:
        X: Input life
    """
    for column in X.columns:
        # Each column is treated as a single-feature dataset
        samples = X[column].values.reshape(-1, 1)
        forest = IsolationForest(n_estimators=self.n_estimators)
        self.forests[column] = forest.fit(samples)
    return self

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Set to NaN the values that the per-column forest labels as outliers.

    Parameters:
        X: Input life

    Returns:
        A new DataFrame with the outliers removed
    """
    cleaned = X.copy()
    for column in X.columns:
        samples = X[column].values.reshape(-1, 1)
        labels = self.forests[column].predict(samples)
        # The forest labels outliers with -1
        is_outlier = labels == -1
        cleaned.loc[is_outlier, column] = np.nan
    return cleaned

RollingMeanOutlierRemover

Bases: TransformerStep

Compute the rolling mean and use it to compute the upper and lower bound to define outliers

Parameters:

Name Type Description Default
window int

Window for the rolling mean, by default 15

15
lambda_ float

Multiplier of the std used to define the bounds, by default 3

3
return_mask bool

Whether to return a mask with the outliers or the original data with the outliers clipped to the rolling bounds, by default False

False
name Optional[str]

Name of the step, by default None

None
Source code in ceruleo/transformation/features/outliers.py
class RollingMeanOutlierRemover(TransformerStep):
    """
    Compute rolling bounds and use them to define outliers

    The bounds are the rolling median -/+ lambda_ times the rolling
    interquartile range (Q3 - Q1) over the window.

    Parameters:
        window: Window for the rolling statistics, by default 15
        lambda_: Multiplier of the rolling interquartile range used to define the bounds, by default 3
        return_mask: Whether to return a mask with the outliers or the original data with the
            outliers clipped to the bounds, by default False
        name: Name of the step, by default None
    """

    def __init__(
        self,
        *,
        window: int = 15,
        lambda_: float = 3,
        return_mask: bool = False,
        name: Optional[str] = None,
    ):
        super().__init__(name=name)
        self.window = window
        self.lambda_ = lambda_
        self.return_mask = return_mask

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            An integer mask of the outliers if return_mask is True,
            otherwise a new DataFrame with the outliers clipped to the bounds
        """
        r = X.rolling(self.window, min_periods=1)
        # Robust spread: rolling interquartile range
        spread = r.quantile(0.75) - r.quantile(0.25)
        center = r.median()
        upper = center + (self.lambda_ * spread)
        lower = center - (self.lambda_ * spread)
        if self.return_mask:
            mask = (X > upper) | (X < lower)
            return mask.astype("int")

        X = X.copy()
        X[X > upper] = upper
        # Compare against the lower bound (the original compared with upper,
        # rewriting every value below the upper bound).
        X[X < lower] = lower
        return X

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the outliers from the input life.

    The bounds are the rolling median -/+ lambda_ times the rolling
    interquartile range (Q3 - Q1) over the window.

    Parameters:
        X: Input life

    Returns:
        An integer mask of the outliers if return_mask is True,
        otherwise a new DataFrame with the outliers clipped to the bounds
    """
    r = X.rolling(self.window, min_periods=1)
    # Robust spread: rolling interquartile range
    spread = r.quantile(0.75) - r.quantile(0.25)
    center = r.median()
    upper = center + (self.lambda_ * spread)
    lower = center - (self.lambda_ * spread)
    if self.return_mask:
        mask = (X > upper) | (X < lower)
        return mask.astype("int")

    X = X.copy()
    X[X > upper] = upper
    # Compare against the lower bound (the original compared with upper,
    # rewriting every value below the upper bound).
    X[X < lower] = lower
    return X

ZScoreOutlierRemover

Bases: TransformerStep

Remove values outside (mean - number_of_std_allowed*std, mean + number_of_std_allowed*std). The outliers are set to NaN

Parameters:

Name Type Description Default
number_of_std_allowed

Number of standard deviations to consider a point an outlier

required
name str

Name of the step, by default None

None
Source code in ceruleo/transformation/features/outliers.py
class ZScoreOutlierRemover(TransformerStep):
    """
    Remove values outside (mean - number_of_std_allowed*std, mean + number_of_std_allowed*std). The outliers are set to NaN

    The mean and the standard deviation of each column are estimated
    with a StandardScaler during fit.

    Parameters:
        number_of_std_allowed: Number of standard deviations to consider a point an outlier
        name: Name of the step, by default None
        prefer_partial_fit: Forwarded to TransformerStep, by default False
    """

    def __init__(
        self,
        *,
        number_of_std_allowed,
        name: str = None,
        prefer_partial_fit: bool = False,
    ):
        super().__init__(name=name, prefer_partial_fit=prefer_partial_fit)
        self.number_of_std_allowed = number_of_std_allowed
        self.scaler = StandardScaler()

    def fit(self, X: pd.DataFrame):
        """
        Fit a StandardScaler to the data

        Parameters:
            X: Input life
        """
        self.scaler.fit(X)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the outliers from the input life.

        Parameters:
            X: Input life

        Returns:
            A new float DataFrame holding the original values, with the
            outliers set to NaN
        """
        # The z-scores are only used to locate the outliers; the returned
        # frame keeps the original scale instead of the standardized values.
        z_scores = np.asarray(self.scaler.transform(X))
        outliers = np.abs(z_scores) > self.number_of_std_allowed
        values = np.where(outliers, np.nan, X.to_numpy(dtype=float))
        return pd.DataFrame(values, columns=X.columns, index=X.index)

fit(X)

Fit a StandardScaler to the data

Parameters:

Name Type Description Default
X DataFrame

Input life

required
Source code in ceruleo/transformation/features/outliers.py
def fit(self, X: pd.DataFrame):
    """
    Estimate the scaling statistics by fitting the step's scaler on the data

    Parameters:
        X: Input life
    """
    scaler = self.scaler
    scaler.fit(X)
    return self

transform(X)

Remove the outliers from the input life.

Parameters:

Name Type Description Default
X DataFrame

Input life

required

Returns:

Type Description
DataFrame

A new DataFrame with the outliers removed

Source code in ceruleo/transformation/features/outliers.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the outliers from the input life.

    Parameters:
        X: Input life

    Returns:
        A new float DataFrame holding the original values, with the
        outliers set to NaN
    """
    # The z-scores are only used to locate the outliers; the returned
    # frame keeps the original scale instead of the standardized values.
    z_scores = np.asarray(self.scaler.transform(X))
    outliers = np.abs(z_scores) > self.number_of_std_allowed
    values = np.where(outliers, np.nan, X.to_numpy(dtype=float))
    return pd.DataFrame(values, columns=X.columns, index=X.index)