Selectors

`ByNameFeatureSelector`

Bases: TransformerStep

Select a subset of features by name

Parameters:

Name	Type	Description	Default
`features`	`Union[str, List[str]]`	Feature name or list of feature names to select	`[]`

Source code in ceruleo/transformation/features/selection.py

class ByNameFeatureSelector(TransformerStep):
    """Select a subset of features by name

    Parameters:
            features: Feature name or list of feature names to select.
                When empty, every column of the fitted data is selected.
            name: Name of the step, by default None
    """
    def __init__(self, *, features:Union[str, List[str]]= [], name: Optional[str] = None):
        super().__init__(name=name)
        # A single name is accepted as shorthand for a one-element list.
        if isinstance(features, str):
            features = [features]
        self.features = features
        self.features_indices = None
        self.features_computed_ = []

    def partial_fit(self, df, y=None):
        """
        Update the selected features with the columns of one more life

        The first call stores the requested features found in df; later
        calls keep only the already stored features that are found again.

        Parameters:
            df: DataFrame containing the input life
        """
        # Build the lookup set once instead of once per candidate name.
        available = set(df.columns)
        if len(self.features) > 0:
            features = [f for f in self.features if f in available]
        else:
            # De-duplicate while keeping the DataFrame's column order, so the
            # result does not depend on set iteration order.
            features = list(dict.fromkeys(df.columns))

        # NOTE(review): an empty stored list is treated as "nothing stored
        # yet", so a selection that shrank to empty is re-seeded from df.
        if len(self.features_computed_) == 0:
            self.features_computed_ = features
        else:
            present = set(features)
            self.features_computed_ = [
                f for f in self.features_computed_ if f in present
            ]
        return self

    def fit(self, df:pd.DataFrame, y=None):
        """
        Compute the names of the features to select

        The requested features that exist in df (or all the columns of df
        when no feature was requested) are stored sorted.

        Parameters:
            df: DataFrame containing the input life
        """
        available = set(df.columns)
        if len(self.features) > 0:
            features = [f for f in self.features if f in available]
        else:
            features = list(available)
        self.features_computed_ = sorted(features)
        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new DataFrame containing only the selected features
        """
        return X.loc[:, self.features_computed_].copy()

    @property
    def n_features(self):
        """Number of features currently selected"""
        return len(self.features_computed_)

    def description(self):
        """Return the parent description together with the selected features"""
        name = super().description()
        return (name, self.features_computed_)

    def __str__(self):
        name, f = self.description()
        # Only the first 10 characters of the joined names are shown.
        features = ', '.join(f)[:10]
        return f'{name} : [{features}]'

`fit(df, y=None)`

Find the names of the features to select

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing the input life	required

Source code in ceruleo/transformation/features/selection.py

def fit(self, df:pd.DataFrame, y=None):
    """
    Compute the names of the features to select

    The requested features that exist in df (or all the columns of df
    when no feature was requested) are stored sorted.

    Parameters:
        df: DataFrame containing the input life
    """
    # Build the lookup set once instead of once per candidate name.
    available = set(df.columns)
    if len(self.features) > 0:
        features = [f for f in self.features if f in available]
    else:
        features = list(available)
    self.features_computed_ = sorted(features)
    return self

`transform(X)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new DataFrame containing only the selected features

Source code in ceruleo/transformation/features/selection.py

def transform(self, X:pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the features chosen while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new DataFrame containing only the selected features
    """
    selected = X.loc[:, self.features_computed_]
    return selected.copy()

`ByTypeFeatureSelector`

Bases: TransformerStep

Select a subset of features by type

Parameters:

Name	Type	Description	Default
`type_`	`Union[str, List]`	Data type to be selected, by default []	`[]`

Source code in ceruleo/transformation/features/selection.py

class ByTypeFeatureSelector(TransformerStep):
    """Keep only the features whose dtype matches the requested type(s)

    Parameters:
            type_: Data type to be selected, by default []
            name: Name of the step, by default None
    """
    def __init__(self, *, type_:Union[str, List]= [], name: Optional[str] = None):
        super().__init__(name=name)

        self.type = type_
        self.features = []

    def partial_fit(self, df, y=None):
        """
        Store the columns of the requested type, unless already stored

        Parameters:
            df: DataFrame containing the input life
        """
        if len(self.features) != 0:
            return self
        matching = df.select_dtypes(include=self.type)
        self.features = matching.columns
        return self

    def fit(self, df, y=None):
        """
        Store the columns of the requested type, unless already stored

        Parameters:
            df: DataFrame containing the input life
        """
        if len(self.features) != 0:
            return self
        matching = df.select_dtypes(include=self.type)
        self.features = matching.columns
        return self

    def transform(self, X:pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new DataFrame containing only the features of the selected type
        """
        by_type = X.loc[:, self.features]
        return by_type.copy()

    @property
    def n_features(self):
        """Number of stored feature names"""
        return len(self.features)

    def description(self):
        """Return the parent description together with the stored features"""
        return (super().description(), self.features)

    def __str__(self):
        name, f = self.description()
        joined = ', '.join(f)
        # Only the first 10 characters of the joined names are shown.
        features = joined[:10]
        return f'{name} : [{features}]'

`transform(X)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new DataFrame containing only the features of the selected type

Source code in ceruleo/transformation/features/selection.py

def transform(self, X:pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the features stored while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new DataFrame containing only the features of the selected type
    """
    by_type = X.loc[:, self.features]
    return by_type.copy()

`DiscardByNameFeatureSelector`

Bases: TransformerStep

Remove a list of features from the input life

Parameters:

Name	Type	Description	Default
`features`	`List`	List of features to discard	`[]`
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/selection.py

class DiscardByNameFeatureSelector(TransformerStep):
    """
    Drop the listed features from the input life

    Parameters:
        features: List of features to discard
        name: Name of the step, by default None
    """
    def __init__(self, *, features: List=[], name: Optional[str] = None):
        super().__init__(name=name)
        self.features_indices = None
        self.features = features

    def fit(self, df:pd.DataFrame, y=None):
        """
        Record the columns of df that are not in the discard list

        Parameters:
            df: DataFrame containing the set of features to discard
        """
        kept = []
        for column in df.columns:
            if column in self.features:
                continue
            kept.append(column)
        self.feature_columns = kept
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new DataFrame containing only the features not in the list of features to discard
        """
        kept = self.feature_columns
        return X.loc[:, kept]

    @property
    def n_features(self):
        # NOTE(review): this is the number of names in the discard list,
        # not the number of columns kept by transform - confirm intent.
        return len(self.features)

`fit(df, y=None)`

Find the indices of the features to discard

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing the set of features to discard	required

Source code in ceruleo/transformation/features/selection.py

def fit(self, df:pd.DataFrame, y=None):
    """
    Record the columns of df that are not in the discard list

    Parameters:
        df: DataFrame containing the set of features to discard
    """
    kept = []
    for column in df.columns:
        if column in self.features:
            continue
        kept.append(column)
    self.feature_columns = kept
    return self

`transform(X)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new DataFrame containing only the features not in the list of features to discard

Source code in ceruleo/transformation/features/selection.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the columns recorded while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new DataFrame containing only the features not in the list of features to discard
    """
    kept = self.feature_columns
    return X.loc[:, kept]

`MatchFeatureSelector`

Bases: TransformerStep

Select all the features that match a pattern

Parameters:

Name	Type	Description	Default
`pattern`	`str`	Pattern to match	required

Source code in ceruleo/transformation/features/selection.py

class  MatchFeatureSelector(TransformerStep):
    """Select all the features whose name contains a pattern

    Parameters:
        pattern: Substring that a feature name must contain to be selected
        name: Name of the step, by default None
    """
    def __init__(self, *, pattern:str, name: Optional[str] = None):
        super().__init__(name=name)
        self.pattern = pattern
        self.selected_columns_ = None


    def partial_fit(self, df: pd.DataFrame, y=None):

        """
        Find the features matching the pattern

        The selection is computed only on the first call; later calls
        leave it untouched.

        Parameters:
            df: DataFrame containing the entire set of features
        """

        if self.selected_columns_ is None:
            # Non-string labels (e.g. integer column names) cannot contain a
            # string pattern; skip them instead of raising TypeError.
            self.selected_columns_ = [
                f for f in df.columns
                if isinstance(f, str) and self.pattern in f
            ]

        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new DataFrame containing only the features selected while fitting

        Raises:
            ValueError: If X is not a DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input array must be a data frame")
        return X[self.selected_columns_].copy()

`partial_fit(df, y=None)`

Find the features matching the pattern

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing the entire set of features	required

Source code in ceruleo/transformation/features/selection.py

def partial_fit(self, df: pd.DataFrame, y=None):

    """
    Find the features matching the pattern

    The selection is computed only on the first call; later calls
    leave it untouched.

    Parameters:
        df: DataFrame containing the entire set of features
    """

    if self.selected_columns_ is None:
        # Non-string labels (e.g. integer column names) cannot contain a
        # string pattern; skip them instead of raising TypeError.
        self.selected_columns_ = [
            f for f in df.columns
            if isinstance(f, str) and self.pattern in f
        ]

    return self

`transform(X, y=None)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new DataFrame containing only the features whose name contains the pattern

Source code in ceruleo/transformation/features/selection.py

def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """
    Keep only the columns selected while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new DataFrame containing only the selected columns

    Raises:
        ValueError: If X is not a DataFrame
    """
    if isinstance(X, pd.DataFrame):
        matched = X[self.selected_columns_]
        return matched.copy()
    raise ValueError("Input array must be a data frame")

`NullProportionSelector`

Bases: TransformerStep

Remove the features whose proportion of null values is not below the given threshold

Parameters:

Name	Type	Description	Default
`max_null_proportion`	`float`	Maximum null proportion threshold	required
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/selection.py

class NullProportionSelector(TransformerStep):
    """
    Remove the features whose proportion of null values is not below a threshold

    Parameters:
        max_null_proportion: Maximum null proportion threshold
        name: Name of the step, by default None
    """
    def __init__(self, *, max_null_proportion: float, name: Optional[str] = None):
        super().__init__(name=name)
        self.max_null_proportion = max_null_proportion
        self.selected_columns_ = None

    def partial_fit(self, X:pd.DataFrame, y=None):
        """
        Narrow the selection to the features below the threshold in this life

        The first call stores the columns of X whose null proportion is
        below the threshold; later calls keep only the stored columns that
        are below the threshold in X as well.

        Parameters:
            X: DataFrame containing the input life
        """
        null_proportion = X.isnull().mean()

        partial_selected_columns_ = X.columns[
            null_proportion < self.max_null_proportion
        ]
        if (
            self.selected_columns_ is not None
            and len(partial_selected_columns_) < len(self.selected_columns_) * 0.5
        ):
            # Say what happened, not just which step logged it.
            logger.warning(type(self).__name__)
            logger.warning(
                f"Life removed more than a half of the columns. Shape {X.shape}"
            )
            logger.warning(
                f"Current: {len(self.selected_columns_)}. New ones: {len(partial_selected_columns_)}"
            )

        if self.selected_columns_ is None:
            self.selected_columns_ = partial_selected_columns_
        else:
            self.selected_columns_ = [
                f for f in self.selected_columns_ if f in partial_selected_columns_
            ]
        if len(self.selected_columns_) == 0:
            logger.warning(type(self).__name__)
            logger.warning("All features were removed")
        return self

    def fit(self, X:pd.DataFrame, y=None):
        """
        Select the features with null proportion lower than the threshold

        Parameters:
            X: DataFrame containing the input life

        Raises:
            ValueError: If X is not a DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input array must be a data frame")
        self.null_proportion = X.isnull().mean()
        self.selected_columns_ = X.columns[
            self.null_proportion < self.max_null_proportion
        ]
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new life containing only the features with null proportion lower than the threshold

        Raises:
            ValueError: If X is not a DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input array must be a data frame")
        return X[self.selected_columns_].copy()

`fit(X, y=None)`

Find the indexes of the features with null proportion lower than the threshold

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	DataFrame containing the input life	required

Source code in ceruleo/transformation/features/selection.py

def fit(self, X:pd.DataFrame, y=None):
    """
    Select the features with null proportion lower than the threshold

    Parameters:
        X: DataFrame containing the input life

    Raises:
        ValueError: If X is not a DataFrame
    """
    if isinstance(X, pd.DataFrame):
        proportions = X.isnull().mean()
        self.null_proportion = proportions
        below_threshold = proportions < self.max_null_proportion
        self.selected_columns_ = X.columns[below_threshold]
        return self
    raise ValueError("Input array must be a data frame")

`partial_fit(X, y=None)`

Find the indexes of the features with null proportion lower than the threshold

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	DataFrame containing the input life	required

Source code in ceruleo/transformation/features/selection.py

def partial_fit(self, X:pd.DataFrame, y=None):
    """
    Narrow the selection to the features below the threshold in this life

    The first call stores the columns of X whose null proportion is
    below the threshold; later calls keep only the stored columns that
    are below the threshold in X as well.

    Parameters:
        X: DataFrame containing the input life
    """
    null_proportion = X.isnull().mean()

    partial_selected_columns_ = X.columns[
        null_proportion < self.max_null_proportion
    ]
    if (
        self.selected_columns_ is not None
        and len(partial_selected_columns_) < len(self.selected_columns_) * 0.5
    ):
        # Say what happened, not just which step logged it.
        logger.warning(type(self).__name__)
        logger.warning(
            f"Life removed more than a half of the columns. Shape {X.shape}"
        )
        logger.warning(
            f"Current: {len(self.selected_columns_)}. New ones: {len(partial_selected_columns_)}"
        )

    if self.selected_columns_ is None:
        self.selected_columns_ = partial_selected_columns_
    else:
        self.selected_columns_ = [
            f for f in self.selected_columns_ if f in partial_selected_columns_
        ]
    if len(self.selected_columns_) == 0:
        logger.warning(type(self).__name__)
        logger.warning("All features were removed")
    return self

`transform(X, y=None)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new life containing only the features with null proportion lower than the threshold

Source code in ceruleo/transformation/features/selection.py

def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """
    Keep only the columns selected while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new life containing only the features with null proportion lower than the threshold

    Raises:
        ValueError: If X is not a DataFrame
    """
    if isinstance(X, pd.DataFrame):
        kept = X[self.selected_columns_]
        return kept.copy()
    raise ValueError("Input array must be a data frame")

`PandasVarianceThreshold`

Bases: TransformerStep

Remove the features whose variance does not exceed the given minimum variance threshold

Parameters:

Name	Type	Description	Default
`min_variance`	`float`	Minimum variance threshold	required
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/selection.py

class PandasVarianceThreshold(TransformerStep):
    """
    Drop the features whose variance does not exceed a minimum threshold

    Parameters:
        min_variance: Minimum variance threshold
        name: Name of the step, by default None
    """
    def __init__(self, *, min_variance: float, name: Optional[str] = None):
        super().__init__(name=name)
        self.selected_columns_ = None
        self.min_variance = min_variance

    def partial_fit(self, X:pd.DataFrame, y=None):
        """
        Narrow the selection to the features above the threshold in this life

        The first call stores the columns of X whose variance is higher than
        the threshold; later calls keep only the stored columns that are
        above the threshold in X as well.

        Parameters:
            X: DataFrame containing the input life
        """
        variances_ = X.var(skipna=True)
        above_threshold = variances_ > self.min_variance
        partial_selected_columns_ = X.columns[above_threshold]

        if self.selected_columns_ is None:
            self.selected_columns_ = partial_selected_columns_
        else:
            # Warn before narrowing so the logged count is the previous one.
            if len(partial_selected_columns_) < len(self.selected_columns_) * 0.5:
                logger.warning(type(self).__name__)
                logger.warning(
                    f"Life removed more than a half of the columns. Shape {X.shape}"
                )
                logger.warning(
                    f"Current: {len(self.selected_columns_)}. New ones: {len(partial_selected_columns_)}"
                )
            still_selected = []
            for column in self.selected_columns_:
                if column in partial_selected_columns_:
                    still_selected.append(column)
            self.selected_columns_ = still_selected

        if len(self.selected_columns_) == 0:
            logger.warning(type(self).__name__)
            logger.warning("All features were removed")
        return self

    def fit(self, X:pd.DataFrame, y=None):
        """
        Select the features with variance higher than the threshold

        Parameters:
            X: DataFrame containing the input life

        Raises:
            ValueError: If X is not a DataFrame
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input array must be a data frame")
        variances = X.var(skipna=True)
        self.variances_ = variances
        self.selected_columns_ = X.columns[variances > self.min_variance]
        logger.debug(
            f"Dropped columns {[c for c in X.columns if c not in self.selected_columns_]}"
        )
        return self

    def transform(self, X:pd.DataFrame, y=None) -> pd.DataFrame:
        """
        Transform the input life

        Parameters:
            X: The input life to be transformed

        Returns:
            A new life containing only the features with variance higher than the threshold

        Raises:
            ValueError: If X is not a DataFrame
        """
        if isinstance(X, pd.DataFrame):
            kept = X[self.selected_columns_]
            return kept.copy()
        raise ValueError("Input array must be a data frame")

`fit(X, y=None)`

Find the indexes of the features with variance higher than the threshold

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	DataFrame containing the input life	required

Source code in ceruleo/transformation/features/selection.py

def fit(self, X:pd.DataFrame, y=None):
    """
    Select the features with variance higher than the threshold

    Parameters:
        X: DataFrame containing the input life

    Raises:
        ValueError: If X is not a DataFrame
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("Input array must be a data frame")
    variances = X.var(skipna=True)
    self.variances_ = variances
    self.selected_columns_ = X.columns[variances > self.min_variance]
    logger.debug(
        f"Dropped columns {[c for c in X.columns if c not in self.selected_columns_]}"
    )
    return self

`partial_fit(X, y=None)`

Find the indexes of the features with variance higher than the threshold

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	DataFrame containing the input life	required

Source code in ceruleo/transformation/features/selection.py

def partial_fit(self, X:pd.DataFrame, y=None):
    """
    Narrow the selection to the features above the threshold in this life

    The first call stores the columns of X whose variance is higher than
    the threshold; later calls keep only the stored columns that are
    above the threshold in X as well.

    Parameters:
        X: DataFrame containing the input life
    """
    variances_ = X.var(skipna=True)
    above_threshold = variances_ > self.min_variance
    partial_selected_columns_ = X.columns[above_threshold]

    if self.selected_columns_ is None:
        self.selected_columns_ = partial_selected_columns_
    else:
        # Warn before narrowing so the logged count is the previous one.
        if len(partial_selected_columns_) < len(self.selected_columns_) * 0.5:
            logger.warning(type(self).__name__)
            logger.warning(
                f"Life removed more than a half of the columns. Shape {X.shape}"
            )
            logger.warning(
                f"Current: {len(self.selected_columns_)}. New ones: {len(partial_selected_columns_)}"
            )
        still_selected = []
        for column in self.selected_columns_:
            if column in partial_selected_columns_:
                still_selected.append(column)
        self.selected_columns_ = still_selected

    if len(self.selected_columns_) == 0:
        logger.warning(type(self).__name__)
        logger.warning("All features were removed")
    return self

`transform(X, y=None)`

Transform the input life

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new life containing only the features with variance higher than the threshold

Source code in ceruleo/transformation/features/selection.py

def transform(self, X:pd.DataFrame, y=None) -> pd.DataFrame:
    """
    Keep only the columns selected while fitting

    Parameters:
        X: The input life to be transformed

    Returns:
        A new life containing only the features with variance higher than the threshold

    Raises:
        ValueError: If X is not a DataFrame
    """
    if isinstance(X, pd.DataFrame):
        kept = X[self.selected_columns_]
        return kept.copy()
    raise ValueError("Input array must be a data frame")

`PositionFeatures`

Bases: TransformerStep

Reorder the features of the input life

Parameters:

Name	Type	Description	Default
`features`	`dict`	Dictionary containing the features to reorder and their new position	required
`name`	`Optional[str]`	Name of the step, by default None	`None`

Source code in ceruleo/transformation/features/selection.py

class PositionFeatures(TransformerStep):
    """
    Reorder the features of the input life

    Parameters:
        features: Dictionary mapping a feature name to its new position
        name: Name of the step, by default None
    """
    def __init__(self, *, features: dict, name: Optional[str] = None):
        super().__init__(name=name)
        self.features = features

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input life by reordering the features

        Each entry of the dictionary is applied in turn: the named feature
        is swapped with the column currently at the target position, so a
        later entry can move a feature placed by an earlier one.

        Parameters:
            X: The input life to be transformed

        Returns:
            A new DataFrame with the columns in the resulting order
        """
        cols = list(X.columns)
        for name, pos in self.features.items():
            current = cols.index(name)
            cols[pos], cols[current] = cols[current], cols[pos]
        # Re-index once with the final order instead of after every swap.
        return X[cols]

`transform(X)`

Transform the input life by reordering the features

Parameters:

Name	Type	Description	Default
`X`	`DataFrame`	The input life to be transformed	required

Returns:

Type	Description
`DataFrame`	A new DataFrame containing the features in the order specified in the constructor

Source code in ceruleo/transformation/features/selection.py

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the input life by reordering the features

    Each entry of the dictionary is applied in turn: the named feature
    is swapped with the column currently at the target position, so a
    later entry can move a feature placed by an earlier one.

    Parameters:
        X: The input life to be transformed

    Returns:
        A new DataFrame with the columns in the resulting order
    """
    cols = list(X.columns)
    for name, pos in self.features.items():
        current = cols.index(name)
        cols[pos], cols[current] = cols[current], cols[pos]
    # Re-index once with the final order instead of after every swap.
    return X[cols]