Source code for skbold.postproc.prevalence

from __future__ import division, print_function, absolute_import

from builtins import range
import os.path as op
import numpy as np
import nibabel as nib
import joblib
from scipy import stats
from tqdm import trange
from warnings import filterwarnings

filterwarnings(action='ignore', category=RuntimeWarning)

class PrevalenceInference(object):
    """ Class that performs prevalence inference based on the paper by
    Allefeld, Görgen, & Haynes (2016), NeuroImage.

    Parameters
    ----------
    obs : numpy ndarray
        A 2D array of shape [N (subjects) x K (voxels)], or a 1D array of
        shape [N,].
    perms : numpy ndarray
        A 3D array of shape [N (subjects) x K (voxels) x P1 (first-level
        permutations)], or a 2D array of shape [N x P1].
    P2 : int
        Number of second-level permutations to run.
    gamma0 : float
        Prevalence null (gamma < gamma0) to test.
    alpha : float
        Significance level for hypothesis testing.

    Examples
    --------
    >>> from skbold.postproc import PrevalenceInference
    >>> import numpy as np
    >>> N, K, P1 = 20, (40, 40, 38), 15
    >>> obs = np.random.normal(loc=0.55, scale=0.05, size=(N, np.prod(K)))
    >>> perms = np.random.normal(loc=0.5, scale=0.05,
    ...                          size=(N, np.prod(K), P1))
    >>> pvi = PrevalenceInference(obs=obs, perms=perms, P2=100000,
    ...                           gamma0=0.5, alpha=0.05)
    >>> pvi.run()
    Running with parameters:
        N = 20
        K = 60800
        P1 = 15
        P2 = 100000
    """
    def __init__(self, obs, perms, P2=100000, gamma0=0.5, alpha=0.05):
        """ Initializes PrevalenceInference object. """
        self.obs = obs
        self.perms = perms
        self.P2 = P2
        self.gamma0 = gamma0
        self.alpha = alpha
        self.N = None
        self.K = None
        self.P1 = None
        print("This is experimental functionality! (i.e., not yet fully "
              "tested)")
    def _check_inputs(self):
        """ Checks and validates input data and extracts parameters. """
        # Remove singleton dimensions
        self.obs = self.obs.squeeze()
        self.perms = self.perms.squeeze()

        if self.perms.ndim > 3:
            raise ValueError("Your perms array should be 2D or 3D!")

        if self.obs.ndim > 2:
            msg = "Observed values (obs) should be 1 or 2 dimensional!"
            raise ValueError(msg)

        if self.obs.ndim == 2:
            # Assume we're dealing with multiple voxels
            self._create_mask()  # To remove NaNs and such
            self.K = self.obs.shape[1]  # set after masking, so K matches
        else:
            # We just have a single score
            self.K = 1
            # Add singleton axis to make calculations more parsimonious
            self.perms = self.perms[:, np.newaxis, :]

        self.N = self.obs.shape[0]
        self.P1 = self.perms.shape[-1]

        print("Running with parameters:\n\tN = %i\n\tK = %i\n\tP1 = %i"
              "\n\tP2 = %i" % (self.N, self.K, self.P1, self.P2))

    def _create_mask(self):
        """ Removes all voxels that are zero or NaN in any subject. """
        # Stack observed and permuted scores together: [N x K x (P1 + 1)]
        all_data = np.dstack((self.obs[:, :, np.newaxis], self.perms))

        # Voxels should be neither zero nor NaN in any subject/permutation
        mask = np.logical_and(~(np.isnan(all_data).sum(axis=(0, 2)) > 0),
                              ~((all_data == 0.0).sum(axis=(0, 2)) > 0))
        all_data_nonzero = all_data[:, mask, :]  # index with mask
        self.mask = mask  # keep for mapping results back to voxel space

        # Unstack obs and perms
        self.obs = all_data_nonzero[:, :, 0]
        self.perms = all_data_nonzero[:, :, 1:]
        print("Found %i non-zero voxels" % mask.sum())
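    # Shape illustration: for obs of shape (20, 60800) and perms of shape
    # (20, 60800, 15), _create_mask() stacks them into all_data of shape
    # (20, 60800, 16); a voxel survives the mask only if none of its
    # 20 * 16 = 320 values is zero or NaN.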
    def run(self):
        """ Runs the actual prevalence inference algorithm. """
        self._check_inputs()

        # Shorten parameters for clarity
        N, K, P1, P2, alpha, gamma0 = (self.N, self.K, self.P1, self.P2,
                                       self.alpha, self.gamma0)

        # Minimum observed score across subjects (per voxel)
        m = np.min(self.obs, axis=0)
        u_rank = np.zeros(K)

        if K > 1:
            c_rank = np.zeros(K)

        for j in trange(P2):  # Loop over second-level permutations
            # Draw one random first-level permutation per subject
            these_perms = np.vstack(
                [self.perms[k, :, np.random.choice(np.arange(P1))]
                 for k in range(N)])
            min_vals = these_perms.min(axis=0)
            u_rank += m <= min_vals  # Update uncorrected ranks

            if K > 1:  # Update corrected ranks
                c_rank += m <= min_vals.max()

        # Calculate statistics!
        # - pu_GN = p-value uncorrected Global Null
        # - pu_MN = p-value uncorrected Majority Null
        # - gamma0_u = largest gamma0 given alpha (uncorrected)
        # - gamma0_max_u = largest possible gamma0 given N, P2, and alpha
        #   (uncorrected)
        pu_GN = (1 + u_rank) / (P2 + 1)
        pu_MN = ((1 - gamma0) * pu_GN ** (1 / N) + gamma0) ** N
        gamma0_u = ((alpha ** (1 / N) - pu_GN ** (1 / N)) /
                    (1 - pu_GN ** (1 / N)))
        gamma0_u[alpha < pu_GN] = 0
        gamma0_max_u = ((alpha ** (1 / N) - (1 / P2) ** (1 / N)) /
                        (1 - (1 / P2) ** (1 / N)))

        self.pu_GN = pu_GN
        self.pu_MN = pu_MN
        self.gamma0_u = gamma0_u
        self.gamma0_max_u = gamma0_max_u

        # Some extra corrected statistics
        # - pc_GN = p-value corrected Global Null
        # - pc_MN = p-value corrected Majority Null
        # - gamma0_c = largest gamma0 given alpha (corrected)
        # - gamma0_max_c = largest possible gamma0 given N, P2, and alpha
        #   (corrected)
        if K > 1:
            pc_GN = (1 + c_rank) / (P2 + 1)
            pc_MN = pc_GN + (1 - pc_GN) * pu_MN
            alpha_c = (alpha - pc_GN) / (1 - pc_GN)
            alpha_c[np.isinf(alpha_c)] = 0
            alpha_c[alpha_c <= 0.0] = 0
            gamma0_c = ((alpha_c ** (1 / N) - pu_GN ** (1 / N)) /
                        (1 - pu_GN ** (1 / N)))
            gamma0_c[alpha_c < pu_GN] = 0
            alpha_max_c = (alpha - 1 / P2) / (1 - 1 / P2)
            gamma0_max_c = ((alpha_max_c ** (1 / N) - (1 / P2) ** (1 / N)) /
                            (1 - (1 / P2) ** (1 / N)))

            self.pc_GN = pc_GN
            self.pc_MN = pc_MN
            self.gamma0_c = gamma0_c
            self.gamma0_max_c = gamma0_max_c

        # Median ("typical") score of observed values
        self.score_typical = np.median(self.obs, axis=0)
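    # Worked example of the uncorrected prevalence bound computed above:
    # with N = 20, alpha = 0.05, and a voxel reaching the smallest possible
    # p-value pu_GN ~= 1e-5 (for P2 = 100000):
    #
    #   gamma0_u = (0.05 ** (1 / 20) - 1e-5 ** (1 / 20))
    #              / (1 - 1e-5 ** (1 / 20))
    #            = (0.8609 - 0.5623) / (1 - 0.5623)
    #            ~= 0.68
    #
    # i.e., the prevalence null "gamma < 0.68" can be rejected for that
    # voxel at alpha = 0.05 (cf. Allefeld et al., 2016).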
    def write(self, path):
        """ Writes results from the prevalence inference procedure to disk.

        Parameters
        ----------
        path : str
            Directory to write the results to.
        """
        pass  # Not yet implemented
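The write method is currently a stub. A minimal sketch of one way it could be
implemented is given below; the helper name write_results is an assumption
for illustration, not part of skbold, and it relies on run() having been
called (and on the mask stored by _create_mask) to map masked statistics
back to the original voxel positions before saving them.

# Hypothetical sketch of what write() might do; NOT part of skbold.
# Assumes pvi.run() has been called and that _create_mask stored pvi.mask
# (a boolean array over the original K voxels).
import os.path as op
import numpy as np


def write_results(pvi, path):
    """ Saves prevalence-inference result arrays as .npy files in path. """
    results = {'pu_GN': pvi.pu_GN, 'pu_MN': pvi.pu_MN,
               'gamma0_u': pvi.gamma0_u,
               'score_typical': pvi.score_typical}

    # Corrected statistics are only computed when K > 1
    for name in ('pc_GN', 'pc_MN', 'gamma0_c'):
        if hasattr(pvi, name):
            results[name] = getattr(pvi, name)

    for name, values in results.items():
        if hasattr(pvi, 'mask') and values.shape == (pvi.mask.sum(),):
            # Scatter masked voxels back to the original K positions
            full = np.zeros(pvi.mask.size)
            full[pvi.mask] = values
            values = full
        np.save(op.join(path, name + '.npy'), values)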