# Source code for skbold.feature_selection.filters

# Class to wrap univariate-feature selection methods in.
# Selects features based on {ufs_method}.scores_ > cutoff.

# Author: Lukas Snoek [lukassnoek.github.io]
# Contact: lukassnoek@gmail.com
# License: 3 clause BSD

from __future__ import division, print_function, absolute_import
from sklearn.feature_selection.univariate_selection import (_BaseFilter,
                                                            check_is_fitted,
                                                            SelectPercentile,
                                                            SelectFwe,
                                                            SelectFpr,
                                                            SelectFdr,
                                                            SelectKBest)
from sklearn.feature_selection import f_classif
import numpy as np


class SelectAboveCutoff(_BaseFilter):
    """ Filter: Select features with a score above some cutoff.

    Parameters
    ----------
    cutoff : int/float
        Cutoff for feature-scores to be selected. A feature is kept when its
        score is strictly greater than this value.
    score_func : callable
        Function that takes a 2D array X (samples x features) and returns a
        score reflecting a univariate difference (higher is better).
    """

    def __init__(self, cutoff, score_func=f_classif):
        super(SelectAboveCutoff, self).__init__(score_func)
        self.cutoff = cutoff

    def _get_support_mask(self):
        """ Build the boolean mask of features whose score exceeds `cutoff`.

        Returns
        -------
        mask : boolean array
            True for every feature to keep. When `scores_` has more than one
            dimension, a feature is kept as soon as any entry along the
            first axis exceeds the cutoff.
        """
        check_is_fitted(self, 'scores_')

        above = np.asarray(self.scores_) > self.cutoff

        if above.ndim > 1:
            # Collapse with a logical OR rather than a sum: summing yields
            # per-feature integer counts, which is not a boolean mask and
            # would be read as feature indices if used to index columns.
            return above.any(axis=0)

        return above


class GenericUnivariateSelect(_BaseFilter):
    """ Univariate feature selector with configurable strategy.

    Updated version from scikit-learn (http://scikit-learn.org/), extended
    with the 'cutoff' mode.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues). For modes 'percentile' or 'k_best' it can return
        a single array scores.
    mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe', 'cutoff'}
        Feature selection mode.
    param : float or int depending on the feature selection mode
        Parameter of the corresponding mode.

    Attributes
    ----------
    scores_ : array-like, shape=(n_features,)
        Scores of features.
    pvalues_ : array-like, shape=(n_features,)
        p-values of feature scores, None if `score_func` returned scores only.
    """

    # Maps each supported `mode` onto the selector class implementing it.
    _selection_modes = {'percentile': SelectPercentile,
                        'k_best': SelectKBest,
                        'fpr': SelectFpr,
                        'fdr': SelectFdr,
                        'fwe': SelectFwe,
                        'cutoff': SelectAboveCutoff}

    def __init__(self, score_func=f_classif, mode='percentile', param=1e-5):
        super(GenericUnivariateSelect, self).__init__(score_func)
        self.mode = mode
        self.param = param

    def _make_selector(self):
        """ Instantiate the selector for `self.mode`, configured with
        `self.param`.

        Returns
        -------
        selector : instance of one of the classes in `_selection_modes`
            Unfitted selector using `self.score_func`, with its
            mode-specific parameter set to `self.param`.
        """
        selector_cls = self._selection_modes[self.mode]

        # Work out the name of the mode-specific parameter from the class
        # itself, so it can be handed to the constructor. Instantiating
        # first and calling set_params() afterwards fails for selectors
        # whose parameter has no default (e.g. SelectAboveCutoff.cutoff).
        param_names = [name for name in selector_cls._get_param_names()
                       if name != 'score_func']

        return selector_cls(score_func=self.score_func,
                            **{param_names[0]: self.param})

    def _check_params(self, X, y):
        """ Validate `mode`, then delegate to the selector's own check.

        Raises
        ------
        ValueError
            If `self.mode` is not one of the keys of `_selection_modes`.
        """
        if self.mode not in self._selection_modes:
            raise ValueError("The mode passed should be one of %s, %r,"
                             " (type %s) was passed."
                             % (self._selection_modes.keys(), self.mode,
                                type(self.mode)))

        self._make_selector()._check_params(X, y)

    def _get_support_mask(self):
        """ Return the support mask computed by the mode-specific selector.

        A fresh selector is created and handed the already computed
        `scores_` and `pvalues_`, so no re-fitting takes place.
        """
        check_is_fitted(self, 'scores_')

        selector = self._make_selector()
        selector.pvalues_ = self.pvalues_
        selector.scores_ = self.scores_
        return selector._get_support_mask()