"""
The confounds module contains code to handle and account for
confounds in pattern analyses.
"""
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
[docs]class ConfoundRegressor(BaseEstimator, TransformerMixin):
""" Fits a confound onto each feature in X and returns their residuals."""
[docs] def __init__(self, confound, X, cross_validate=True, precise=False,
stack_intercept=True):
""" Regresses out a variable (confound) from each feature in X.
Parameters
----------
confound : numpy array
Array of length (n_samples, n_confounds) to regress out of each
feature; May have multiple columns for multiple confounds.
X : numpy array
Array of length (n_samples, n_features), from which the confound
will be regressed. This is used to determine how the
confound-models should be cross-validated (which is necessary
to use in in scikit-learn Pipelines).
cross_validate : bool
Whether to cross-validate the confound-parameters (y~confound)
estimated from the train-set to the test set (cross_validate=True)
or whether to fit the confound regressor separately on the test-set
(cross_validate=False). Setting this parameter to True is equivalent
to "foldwise confound regression" (FwCR) as described in our paper
(https://www.biorxiv.org/content/early/2018/03/28/290684). Setting
this parameter to False, however, is NOT equivalent to "whole
dataset confound regression" (WDCR) as it does not apply confound
regression to the *full* dataset, but simply refits the confound
model on the test-set. We recommend setting this parameter to True.
precise: bool
Transformer-objects in scikit-learn only allow to pass the data
(X) and optionally the target (y) to the fit and transform methods.
However, we need to index the confound accordingly as well. To do so,
we compare the X during initialization (self.X) with the X passed to
fit/transform. As such, we can infer which samples are passed to the
methods and index the confound accordingly. When setting precise to
True, the arrays are compared feature-wise, which is accurate, but
relatively slow. When setting precise to False, it will infer the index
by looking at the sum of all the features, which is less accurate, but much
faster. For dense data, this should work just fine. Also, to aid the
accuracy, we remove the features which are constant (0) across samples.
stack_intercept : bool
Whether to stack an intercept to the confound (default is True)
Attributes
----------
weights_ : numpy array
Array with weights for the confound(s).
"""
self.confound = confound
self.cross_validate = cross_validate
self.X = X
self.precise = precise
self.stack_intercept = stack_intercept
self.weights_ = None
self.nonzero_X_ = None
[docs] def fit(self, X, y=None):
""" Fits the confound-regressor to X.
Parameters
----------
X : numpy array
An array of shape (n_samples, n_features), which should correspond
to your train-set only!
y : None
Included for compatibility; does nothing.
"""
if self.stack_intercept:
icept = np.ones(self.confound.shape[0])
self.confound = np.c_[icept, self.confound]
# Find nonzero voxels (i.e., voxels which have not all zero
# values across samples)
if self.nonzero_X_ is None:
self.nonzero_X_ = np.sum(self.X, axis=0) != 0
self.X = self.X[:, self.nonzero_X_]
X_nz = X[:, self.nonzero_X_]
confound = self.confound
if self.precise:
tmp = np.in1d(self.X, X_nz).reshape(self.X.shape)
fit_idx = tmp.sum(axis=1) == self.X.shape[1]
else:
fit_idx = np.in1d(self.X.sum(axis=1), X_nz.sum(axis=1))
confound_fit = confound[fit_idx, :]
# Vectorized implementation estimating weights for all features
self.weights_ = np.linalg.lstsq(confound_fit, X_nz, rcond=None)[0]
return self