# Source code for skbold.preproc.label_preproc

# Classes to preprocess labels ('y').

# Author: Lukas Snoek [lukassnoek.github.io]
# Contact: lukassnoek@gmail.com
# License: 3 clause BSD

from __future__ import print_function, division, absolute_import
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as stat


class LabelFactorizer(BaseEstimator, TransformerMixin):
    """ Transforms labels according to a given factorial grouping.

    Factorizes/encodes labels based on part of the string label. For
    example, the label-vector ['A_1', 'A_2', 'B_1', 'B_2'] can be grouped
    based on letter (A/B) or number (1/2).

    Parameters
    ----------
    grouping : List of str
        List with identifiers for condition names as strings

    Attributes
    ----------
    new_labels_ : ndarray
        Array with the new (string) labels, set by transform().
    """

    def __init__(self, grouping):
        self.grouping = grouping
        self.new_labels_ = None

    def fit(self, y=None, X=None):
        """ Does nothing, but included to be used in sklearn's Pipeline. """
        return self

    def transform(self, y, X=None):
        """ Transforms label-vector given a grouping.

        Parameters
        ----------
        y : List/ndarray of str
            List or ndarray with strings indicating label-names
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]

        Returns
        -------
        y_new : ndarray
            array with transformed y-labels (integer code per group);
            samples matching none of the groups are dropped
        X_new : ndarray
            array with transformed data of shape = [n_samples, n_features]
            given new factorial grouping/design; only returned when X is
            passed.
        """
        y_new = np.full(len(y), -1.0)
        # BUGFIX: use dtype=object so group identifiers longer than the
        # 14-character placeholder ('parsing error!') are not silently
        # truncated by a fixed-width string dtype.
        self.new_labels_ = np.array(['parsing error!'] * len(y), dtype=object)
        all_idx = np.zeros(len(y))

        for i, g in enumerate(self.grouping):
            # A sample belongs to group g when g is a substring of its label.
            idx = np.array([g in label for label in y])
            y_new[idx] = i
            self.new_labels_[idx] = g
            all_idx += idx

        # Keep only samples that matched at least one group.
        all_idx = all_idx.astype(bool)
        y_new = y_new[all_idx]
        self.new_labels_ = self.new_labels_[all_idx]
        if X is not None:
            X_new = X[all_idx, :]
            return y_new, X_new

        return y_new

    def get_new_labels(self):
        """ Returns new labels based on factorization. """
        return self.new_labels_
class MajorityUndersampler(BaseEstimator, TransformerMixin):
    """ Undersamples the majority-class(es) by selecting random samples.

    Parameters
    ----------
    verbose : bool
        Whether to print the downsampled number of samples.

    Attributes
    ----------
    idx_ : ndarray
        Boolean mask of the kept samples, set by transform().
    """

    def __init__(self, verbose=False):
        """ Initializes MajorityUndersampler object. """
        self.verbose = verbose
        self.idx_ = None

    def fit(self, X=None, y=None):
        """ Does nothing, but included for scikit-learn pipelines. """
        return self

    def transform(self, X, y):
        """ Downsamples majority-class(es).

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : ndarray
            Array of shape = [n_samples] with non-negative integer class
            labels (floats are cast to int).

        Returns
        -------
        X : ndarray
            Subsampled array of shape = [n_kept, n_features]
        y : ndarray
            Subsampled label-array of shape = [n_kept]
        """
        if isinstance(y[0], (np.float64, np.float32, np.float16)):
            print('Converting y to integer')
            y = y.astype(int)

        bins = np.bincount(y)
        classes = np.unique(y)
        # BUGFIX: take the minimum over classes actually present in y.
        # np.min(bins) is 0 for non-consecutive labels (e.g. y in {0, 2}
        # gives bins [n0, 0, n2]), which downsampled every class to zero.
        min_count = bins[classes].min()
        all_idx = np.zeros(y.size, dtype=bool)

        for i in classes:
            if bins[i] != min_count:
                # Randomly keep min_count samples of this majority class.
                y_idx = y == i
                tmp_idx = np.zeros(y_idx.sum(), dtype=bool)
                idx_idx = np.random.choice(np.arange(y_idx.sum()),
                                           min_count, replace=False)
                tmp_idx[idx_idx] = True
                all_idx[y_idx] = tmp_idx
            else:
                # Minority class: keep every sample.
                all_idx[y == i] = True

        if self.verbose:
            y_ds = y[all_idx]
            print('Number of samples (after resampling): %.3f' % y_ds.size)
            print('Resampled class proportion: %.3f\n' % y_ds.mean())

        self.idx_ = all_idx
        return X[all_idx, :], y[all_idx]
class LabelBinarizer(BaseEstimator, TransformerMixin):
    """ Binarizes a continuous target ('y') into two classes.

    The strategy is picked by the 'type' key of ``params``; supported
    strategies are 'percentile', 'zscore', 'constant', and 'median'.

    Attributes
    ----------
    idx_ : ndarray or None
        Boolean mask of the kept samples (None when no samples are
        dropped), set by transform().
    binarize_params : dict
        Parameters describing the applied binarization, set by transform().
    """

    def __init__(self, params):
        """ Initializes LabelBinarizer object. """
        self.params = params
        self.idx_ = None
        self.binarize_params = None

    def fit(self, X=None, y=None):
        """ Does nothing, but included for scikit-learn pipelines. """
        return self

    def transform(self, X, y):
        """ Binarizes y-attribute.

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : ndarray
            Array of shape = [n_samples] with a continuous target.

        Returns
        -------
        X : ndarray
            Array of shape = [n_kept, n_features]; rows are dropped for
            the 'percentile' and 'zscore' strategies.
        y : ndarray
            Binarized (0/1) label-array of shape = [n_kept].
        """
        options = ['percentile', 'zscore', 'constant', 'median']
        params = self.params
        kind = params['type']

        if kind == 'percentile':
            # Rank each sample, keep only the tails, label upper tail 1.
            ranks = np.array([stat.percentileofscore(y, val, 'rank')
                              for val in y])
            idx = (ranks < params['low']) | (ranks > params['high'])
            self.binarize_params = {
                'type': 'percentile',
                'low': stat.scoreatpercentile(y, params['low']),
                'high': stat.scoreatpercentile(y, params['high'])}
            y = (ranks[idx] > 50).astype(int)
        elif kind == 'zscore':
            # Standardize, keep only samples beyond n_std deviations.
            mean, sd = y.mean(), y.std()
            z = (y - mean) / sd
            idx = np.abs(z) > params['std']
            self.binarize_params = {'type': kind, 'mean': mean, 'std': sd,
                                    'n_std': params['std']}
            y = (z[idx] > 0).astype(int)
        elif kind == 'constant':
            # Fixed threshold; no samples dropped.
            idx = None
            self.binarize_params = {'type': kind, 'cutoff': params['cutoff']}
            y = (y > params['cutoff']).astype(int)
        elif kind == 'median':
            # Median-split; no samples dropped.
            idx = None
            med = np.median(y)
            self.binarize_params = {'type': kind, 'median': med}
            y = (y > med).astype(int)
        else:
            raise KeyError('Unknown type; please choose from: %r' % options)

        if idx is not None:
            X = X[idx, :]

        self.idx_ = idx
        return X, y