# Source code for skbold.preproc.label_preproc
# Classes to preprocess labels ('y').
# Author: Lukas Snoek [lukassnoek.github.io]
# Contact: lukassnoek@gmail.com
# License: 3 clause BSD
from __future__ import print_function, division, absolute_import
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as stat
class LabelFactorizer(BaseEstimator, TransformerMixin):
    """ Transforms labels according to a given factorial grouping.

    Factorizes/encodes labels based on part of the string label. For example,
    the label-vector ['A_1', 'A_2', 'B_1', 'B_2'] can be grouped
    based on letter (A/B) or number (1/2).

    Parameters
    ----------
    grouping : List of str
        List with identifiers for condition names as strings

    Attributes
    ----------
    new_labels_ : ndarray of str or None
        Grouping identifier assigned to each retained sample; None until
        transform() has been called.
    """

    def __init__(self, grouping):
        self.grouping = grouping
        self.new_labels_ = None

    def fit(self, y=None, X=None):
        """ Does nothing, but included to be used in sklearn's Pipeline. """
        return self

    def transform(self, y, X=None):
        """ Transforms label-vector given a grouping.

        Each label is assigned the index of the grouping identifier that
        occurs in it as a substring. If several identifiers occur in the
        same label, the one that comes last in `grouping` wins. Samples
        whose label contains none of the identifiers are dropped.

        Parameters
        ----------
        y : List/ndarray of str
            List or ndarray with strings indicating label-names
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]

        Returns
        -------
        y_new : ndarray
            Float array with transformed y-labels (index into `grouping`)
            for the retained samples.
        X_new : ndarray
            Only returned if X is not None. Rows of X belonging to the
            retained samples, shape = [n_retained, n_features].
        """
        n_samples = len(y)

        # Float codes, matching the dtype this method has always returned.
        y_new = np.full(n_samples, -1.0)

        # Object dtype so that identifiers of any length are stored intact;
        # a fixed-width string array would silently truncate long names.
        new_labels = np.empty(n_samples, dtype=object)
        matched = np.zeros(n_samples, dtype=bool)

        for i, g in enumerate(self.grouping):
            # Explicit bool dtype keeps this a valid mask even for empty y.
            idx = np.array([g in label for label in y], dtype=bool)
            y_new[idx] = i
            new_labels[idx] = g
            matched |= idx

        # Index new labels, y, and X with new factorial labels
        y_new = y_new[matched]
        self.new_labels_ = new_labels[matched].astype(str)

        if X is not None:
            X_new = X[matched, :]
            return y_new, X_new

        return y_new

    def get_new_labels(self):
        """ Returns new labels based on factorization. """
        return self.new_labels_
class MajorityUndersampler(BaseEstimator, TransformerMixin):
    """
    Undersamples the majority-class(es) by selecting random samples.

    Every class that has more samples than the smallest class present in
    y is randomly reduced (without replacement) to the size of that
    smallest class.

    Parameters
    ----------
    verbose : bool
        Whether to print the number of samples after downsampling.

    Attributes
    ----------
    idx_ : ndarray of bool or None
        Mask over the original samples marking those that were kept by the
        most recent call to transform(); None before the first call.
    """

    def __init__(self, verbose=False):
        """ Initializes MajorityUndersampler object. """
        self.verbose = verbose
        self.idx_ = None

    def fit(self, X=None, y=None):
        """ Does nothing, but included for scikit-learn pipelines. """
        return self

    def transform(self, X, y):
        """ Downsamples majority-class(es).

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : List/ndarray
            Class labels of length n_samples. Floating point labels are
            cast to integers.

        Returns
        -------
        X : ndarray
            Rows of X that were retained, shape = [n_retained, n_features].
        y : ndarray
            Labels of the retained samples.
        """
        y = np.asarray(y)
        if np.issubdtype(y.dtype, np.floating):
            print('Converting y to integer')
            y = y.astype(int)

        # Count only the classes that actually occur. np.bincount would
        # report a count of zero for every absent integer below max(y),
        # making the "minority size" zero and discarding all samples.
        classes, counts = np.unique(y, return_counts=True)
        all_idx = np.zeros(y.size, dtype=bool)

        if counts.size:
            n_min = counts.min()
            for label, count in zip(classes, counts):
                y_idx = y == label
                if count == n_min:
                    # Already a minority-sized class: keep everything.
                    all_idx[y_idx] = True
                    continue

                # Randomly pick n_min of this class' samples to keep.
                keep = np.zeros(count, dtype=bool)
                chosen = np.random.choice(np.arange(count), n_min,
                                          replace=False)
                keep[chosen] = True
                all_idx[y_idx] = keep

        X_ds, y_ds = X[all_idx, :], y[all_idx]

        if self.verbose:
            print('Number of samples (after resampling): %.3f' % y_ds.size)
            print('Resampled class proportion: %.3f\n' % y_ds.mean())

        self.idx_ = all_idx
        return X_ds, y_ds
class LabelBinarizer(BaseEstimator, TransformerMixin):
    """ Binarizes a numeric label-vector ('y') into 0/1 classes.

    Parameters
    ----------
    params : dict
        Binarization settings. The key 'type' selects the method; the
        remaining keys depend on it:

        - 'percentile': 'low' and 'high' (percentile ranks). Samples with
          a percentile rank below 'low' or above 'high' are kept; the rest
          is dropped.
        - 'zscore': 'std'. Samples whose absolute z-score exceeds 'std'
          are kept; the rest is dropped.
        - 'constant': 'cutoff'. All samples are kept; values above
          'cutoff' become 1.
        - 'median': no further keys. All samples are kept; values above
          the median become 1.

    Attributes
    ----------
    idx_ : ndarray of bool or None
        Mask of the samples kept by the last transform() call, or None if
        the method keeps all samples (or transform() was not called yet).
    binarize_params : dict or None
        Values actually used for the last binarization (e.g. the scores at
        the requested percentiles, or the mean/std of y).
    """

    def __init__(self, params):
        """ Initializes LabelBinarizer object. """
        self.params = params
        self.idx_ = None
        self.binarize_params = None

    def fit(self, X=None, y=None):
        """ Does nothing, but included for scikit-learn pipelines. """
        return self

    def transform(self, X, y):
        """ Binarizes y-attribute.

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : List/ndarray
            Numeric label-vector of length n_samples.

        Returns
        -------
        X : ndarray
            X restricted to the samples that were kept (unchanged for the
            'constant' and 'median' methods, or if X is None).
        y : ndarray
            Integer array of 0/1 labels for the samples that were kept.

        Raises
        ------
        KeyError
            If params['type'] is not one of the supported methods, or a
            key required by the chosen method is missing from params.
        """
        options = ['percentile', 'zscore', 'constant', 'median']
        params = self.params

        # Accept plain lists as well; the arithmetic below needs an ndarray.
        y = np.asarray(y)
        method = params['type']

        if method == 'percentile':
            y_rank = [stat.percentileofscore(y, a, 'rank') for a in y]
            y_rank = np.array(y_rank)
            idx = (y_rank < params['low']) | (y_rank > params['high'])

            low = stat.scoreatpercentile(y, params['low'])
            high = stat.scoreatpercentile(y, params['high'])
            self.binarize_params = {'type': 'percentile',
                                    'low': low,
                                    'high': high}

            # Kept samples are split by which side of the 50th percentile
            # rank they fall on.
            y = (y_rank[idx] > 50).astype(int)

        elif method == 'zscore':
            # If y is constant its std is 0, the z-scores become NaN and
            # no sample passes the threshold below.
            y_norm = (y - y.mean()) / y.std()  # just to be sure
            idx = np.abs(y_norm) > params['std']
            self.binarize_params = {'type': params['type'],
                                    'mean': y.mean(),
                                    'std': y.std(),
                                    'n_std': params['std']}
            y = (y_norm[idx] > 0).astype(int)

        elif method == 'constant':
            y = (y > params['cutoff']).astype(int)
            idx = None
            self.binarize_params = {'type': params['type'],
                                    'cutoff': params['cutoff']}

        elif method == 'median':  # median-split
            median = np.median(y)
            y = (y > median).astype(int)
            idx = None
            self.binarize_params = {'type': params['type'],
                                    'median': median}

        else:
            msg = 'Unknown type; please choose from: %r' % options
            raise KeyError(msg)

        # Only subset X when samples were dropped and X was supplied.
        if idx is not None and X is not None:
            X = X[idx, :]

        self.idx_ = idx
        return X, y