Source code for skbold.preproc.confounds

"""
The confounds module contains code to handle and account for
confounds in pattern analyses.
"""

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


[docs]class ConfoundRegressor(BaseEstimator, TransformerMixin):
    """ Fits a confound onto each feature in X and returns their residuals."""

[docs]    def __init__(self, confound, X, cross_validate=True, precise=False,
                 stack_intercept=True):
        """ Regresses out a variable (confound) from each feature in X.

        Parameters
        ----------
        confound : numpy array
            Array of length (n_samples, n_confounds) to regress out of each
            feature; May have multiple columns for multiple confounds.
        X : numpy array
            Array of length (n_samples, n_features), from which the confound
            will be regressed. This is used to determine how the
            confound-models should be cross-validated (which is necessary
            to use in in scikit-learn Pipelines).
        cross_validate : bool
            Whether to cross-validate the confound-parameters (y~confound)
            estimated from the train-set to the test set (cross_validate=True)
            or whether to fit the confound regressor separately on the test-set
            (cross_validate=False). Setting this parameter to True is equivalent
            to "foldwise confound regression" (FwCR) as described in our paper
            (https://www.biorxiv.org/content/early/2018/03/28/290684). Setting
            this parameter to False, however, is NOT equivalent to "whole
            dataset confound regression" (WDCR) as it does not apply confound
            regression to the *full* dataset, but simply refits the confound
            model on the test-set. We recommend setting this parameter to True.
        precise: bool
            Transformer-objects in scikit-learn only allow to pass the data
            (X) and optionally the target (y) to the fit and transform methods.
            However, we need to index the confound accordingly as well. To do so,
            we compare the X during initialization (self.X) with the X passed to
            fit/transform. As such, we can infer which samples are passed to the
            methods and index the confound accordingly. When setting precise to
            True, the arrays are compared feature-wise, which is accurate, but
            relatively slow. When setting precise to False, it will infer the index
            by looking at the sum of all the features, which is less accurate, but much
            faster. For dense data, this should work just fine. Also, to aid the
            accuracy, we remove the features which are constant (0) across samples.
        stack_intercept : bool
            Whether to stack an intercept to the confound (default is True)

        Attributes
        ----------
        weights_ : numpy array
            Array with weights for the confound(s).
        """

        self.confound = confound
        self.cross_validate = cross_validate
        self.X = X
        self.precise = precise
        self.stack_intercept = stack_intercept
        self.weights_ = None
        self.nonzero_X_ = None

[docs]    def fit(self, X, y=None):
        """ Fits the confound-regressor to X.

        Parameters
        ----------
        X : numpy array
            An array of shape (n_samples, n_features), which should correspond
            to your train-set only!
        y : None
            Included for compatibility; does nothing.
        """

        if self.stack_intercept:
            icept = np.ones(self.confound.shape[0])
            self.confound = np.c_[icept, self.confound]

        # Find nonzero voxels (i.e., voxels which have not all zero
        # values across samples)
        if self.nonzero_X_ is None:
            self.nonzero_X_ = np.sum(self.X, axis=0) != 0
            self.X = self.X[:, self.nonzero_X_]

        X_nz = X[:, self.nonzero_X_]
        confound = self.confound

        if self.precise:
            tmp = np.in1d(self.X, X_nz).reshape(self.X.shape)
            fit_idx = tmp.sum(axis=1) == self.X.shape[1]
        else:
            fit_idx = np.in1d(self.X.sum(axis=1), X_nz.sum(axis=1))

        confound_fit = confound[fit_idx, :]

        # Vectorized implementation estimating weights for all features
        self.weights_ = np.linalg.lstsq(confound_fit, X_nz, rcond=None)[0]
        return self

[docs]    def transform(self, X):
        """ Regresses out confound from X.

        Parameters
        ----------
        X : numpy array
            An array of shape (n_samples, n_features), which should correspond
            to your train-set only!

        Returns
        -------
        X_new : ndarray
            ndarray with confound-regressed features
        """

        if not self.cross_validate:
            self.fit(X)

        X_nz = X[:, self.nonzero_X_]

        if self.precise:
            tmp = np.in1d(self.X, X_nz).reshape(self.X.shape)
            transform_idx = tmp.sum(axis=1) == self.X.shape[1]
        else:
            transform_idx = np.in1d(self.X.sum(axis=1), X_nz.sum(axis=1))

        confound_transform = self.confound[transform_idx]
        X_new = X - confound_transform.dot(self.weights_)
        X_corr = np.zeros_like(X)
        X_corr[:, self.nonzero_X_] = X_new
        return X_corr