# Source code for skbold.exp_model.parse_presentation_logfile

# Parses a Presentation (neurobs.com) logfile.

# Author: Lukas Snoek [lukassnoek.github.io]
# Contact: lukassnoek@gmail.com
# License: 3 clause BSD

from __future__ import division, print_function, absolute_import
from builtins import range
import os
import os.path as op
import pandas as pd
import numpy as np
from glob import glob


class PresentationLogfileCrawler(object):
    """
    Logfile crawler for Presentation (Neurobs) files; cleans logfile,
    calculates event onsets and durations, and (optionally) writes out
    .bfsl files per condition.

    Parameters
    ----------
    in_file : str or list
        Absolute path to logfile (can be a list of paths).
    con_names : list
        List with names for each condition
    con_codes : list
        List with codes for conditions. Each entry can be a single number or
        string (in the latter case, it may be a substring of the logged
        code) or a list with possible values (all numbers or all strings).
    con_design : list or str
        Which 'design' to assume for events (if 'multivar', all events -
        regardless of condition - are treated as a separate
        condition/regressor; if 'univar', all events from a single condition
        are treated as a single condition). A single string is applied to
        every condition. Default: 'univar' for all conditions.
    con_duration : list
        If the duration cannot be parsed from the logfile, you can specify them
        here manually (per condition). A single number is applied to every
        condition.
    pulsecode : int
        Code with which the first (or any) pulse is logged.
    write_bfsl : bool
        Whether to write out a .bfsl file per condition. Existing .bfsl
        files next to the logfile are only removed when this is True.
    verbose : bool
        Print out intermediary output.

    Attributes
    ----------
    df : Dataframe or None
        Optional pre-loaded logfile table. When set (it is None after
        construction), a copy of it is parsed instead of reading the file
        from disk.
    to_write : Dataframe or None
        Events (Time, Duration, Weight, Name) of the condition that was
        processed last.
    base_dir : str or None
        Directory of the logfile that was processed last; .bfsl files are
        written here.
    """

    def __init__(self, in_file, con_names, con_codes, con_design=None,
                 con_duration=None, pulsecode=30, write_bfsl=False,
                 verbose=True):

        if isinstance(in_file, str):
            in_file = [in_file]

        self.in_file = in_file
        self.con_names = con_names
        self.con_codes = con_codes

        if con_duration is not None:

            if isinstance(con_duration, (int, float, np.number)):
                con_duration = [con_duration]
            else:
                # Work on a copy so the caller's sequence is never mutated
                # (and so that arrays are repeated, not multiplied).
                con_duration = list(con_duration)

            if len(con_duration) < len(con_names):
                con_duration = con_duration * len(con_names)

        self.con_duration = con_duration

        design_params = ['univar', 'multivar', None]
        msg = 'Unknown design-parameter; please choose from: %r' % \
              design_params

        if con_design is None or isinstance(con_design, str):
            if con_design not in design_params:
                raise ValueError(msg)
            # One design entry per condition; None means 'univar'.
            con_design = [con_design or 'univar'] * len(con_names)

        elif isinstance(con_design, list):

            if not all(d in design_params for d in con_design):
                raise ValueError(msg)

            con_design = [d or 'univar' for d in con_design]

            if len(con_design) < len(con_names):
                raise ValueError('con_design has %i entries, but there are '
                                 '%i conditions.' % (len(con_design),
                                                     len(con_names)))
        else:
            msg = 'Unknown type for con_design; please specify list or str.'
            raise ValueError(msg)

        self.con_design = con_design
        self.pulsecode = pulsecode
        self.write_bfsl = write_bfsl
        self.verbose = verbose
        self.df = None
        self.to_write = None
        self.base_dir = None

    @staticmethod
    def _match_code(logged_codes, code):
        """
        Returns a boolean array marking the logged codes that match `code`.

        Parameters
        ----------
        logged_codes : iterable
            Cleaned 'Code' column, holding floats (numeric codes) and
            strings (all other codes).
        code : number, str, or list
            Condition code(s). Strings match as substrings of logged string
            codes; numbers match logged numeric codes by value.

        Raises
        ------
        ValueError
            If `code` mixes numbers and strings.
        """
        numeric_types = (int, float, np.number)

        if isinstance(code, (str,) + numeric_types):
            candidates = [code]
        else:
            candidates = list(code)

        if all(isinstance(c, str) for c in candidates):
            hits = [isinstance(x, str) and any(c in x for c in candidates)
                    for x in logged_codes]
        elif all(isinstance(c, numeric_types) for c in candidates):
            wanted = set(float(c) for c in candidates)
            hits = [(not isinstance(x, str)) and x in wanted
                    for x in logged_codes]
        else:
            raise ValueError('Condition codes should be either all numbers '
                             'or all strings, but got: %r' % (code,))

        return np.array(hits, dtype=bool)

    def _parse(self, f):
        """
        Parses a single logfile and returns its subject-info dictionary.

        Parameters
        ----------
        f : str
            Path to the logfile.

        Returns
        -------
        subject_info : dict
            Dictionary with keys 'conditions', 'onsets', 'durations',
            'amplitudes', 'regressor_names', and 'regressors'. The
            'conditions', 'onsets', and 'durations' lists have one entry per
            'univar' condition and one entry per event for 'multivar'
            conditions.

        Raises
        ------
        ValueError
            If the pulsecode or a condition code is not found in the
            logfile, or if more than one duration is missing while no
            manual durations were given.
        """

        if self.verbose:
            print('Processing %s' % f)

        self.base_dir = op.dirname(f)

        # Remove existing .bfsl files, but only if new ones will be written
        if self.write_bfsl:
            for stale in glob(op.join(self.base_dir, '*.bfsl')):
                os.remove(stale)

        if self.df is not None:
            # Copy, so a pre-loaded table is not rescaled on every parse
            df = self.df.copy()
        else:
            df = pd.read_table(f, sep='\t', skiprows=3,
                               skip_blank_lines=True)

        # Clean up unnecessary columns
        to_drop = ['Uncertainty', 'Subject', 'Trial', 'Uncertainty.1',
                   'ReqTime', 'ReqDur', 'Stim Type', 'Pair Index']
        df = df.drop([col for col in to_drop if col in df.columns], axis=1)

        # Ugly hack to find pulsecode, because some numeric codes are
        # written as str: all-digit codes become floats, the rest stay str
        df['Code'] = [float(x) if x.isdigit() else x
                      for x in df['Code'].astype(str)]

        pulse_times = df['Time'][df['Code'] == self.pulsecode]

        if len(pulse_times) == 0:
            raise ValueError('No entries found for pulsecode: %r'
                             % (self.pulsecode,))

        # pulse_t = absolute time of first pulse (if mult pulses are logged)
        pulse_t = pulse_times.iloc[0]

        # Onsets relative to the first pulse; raw values are divided by 1e4
        df['Time'] = (df['Time'] - float(pulse_t)) / 10000.0
        df['Duration'] = df['Duration'] / 10000.0

        conditions = []
        trial_onsets = []
        trial_durations = []

        # Loop over condition-codes to find indices/times/durations
        for i, code in enumerate(self.con_codes):

            idx = self._match_code(df['Code'], code)
            n_events = int(idx.sum())

            if n_events == 0:
                raise ValueError('No entries found for code: %r' % (code,))

            # Generate dataframe with time, duration, and weight given idx
            to_write = pd.DataFrame({'Time': df['Time'][idx]})

            if self.con_duration is None:
                durations = df['Duration'][idx]
                n_nan = int(pd.isnull(durations).sum())
                # NOTE(review): a single missing duration is tolerated here
                # and ends up as NaN in the output - confirm this is intended
                if n_nan > 1:
                    msg = ('In total, %i NaNs found for Duration. '
                           'Specify duration manually.' % n_nan)
                    raise ValueError(msg)
                to_write['Duration'] = durations.round(2)
            else:
                to_write['Duration'] = [self.con_duration[i]] * n_events

            to_write['Weight'] = np.ones(n_events)
            to_write['Name'] = ['%s_%i' % (self.con_names[i], j + 1)
                                for j in range(n_events)]

            onsets = to_write['Time'].tolist()
            durations = to_write['Duration'].tolist()

            if self.con_design[i] == 'univar':
                # All events of this condition form a single regressor
                conditions.append(self.con_names[i])
                trial_onsets.append(onsets)
                trial_durations.append(durations)
            elif self.con_design[i] == 'multivar':
                # Every event becomes its own regressor, named <con>_<nr>
                conditions.extend(to_write['Name'].tolist())
                trial_onsets.extend([t] for t in onsets)
                trial_durations.extend([d] for d in durations)

            self.to_write = to_write

            if self.write_bfsl:
                self._write_bfsl(i)

        # Plain dict; converting it to a Bunch instance (for nipype) is left
        # to the caller.
        subject_info = {'conditions': conditions,
                        'onsets': trial_onsets,
                        'durations': trial_durations,
                        'amplitudes': None,
                        'regressor_names': self.con_names,
                        'regressors': None}

        return subject_info

    def _write_bfsl(self, i):
        """
        Writes the events in `self.to_write` to tab-separated .bfsl file(s).

        For a 'univar' condition, a single file <con_name>.bfsl is written
        with one row (Time, Duration, Weight) per event; for a 'multivar'
        condition, one single-row file <event_name>.bfsl is written per
        event. Files are written to `self.base_dir`, without header or
        index.

        Parameters
        ----------
        i : int
            Index of the condition (into con_names/con_design) to which
            `self.to_write` belongs.
        """

        columns = ['Time', 'Duration', 'Weight']
        to_write = self.to_write

        if self.con_design[i] == 'univar':
            name = op.join(self.base_dir, '%s.bfsl' % self.con_names[i])
            # Select columns instead of dropping 'Name', so that
            # self.to_write is left intact
            to_write[columns].to_csv(name, sep='\t', index=False,
                                     header=False)

        elif self.con_design[i] == 'multivar':

            for _, row in to_write.iterrows():
                name = op.join(self.base_dir, '%s.bfsl' % row['Name'])
                df_tmp = pd.DataFrame({'Time': row['Time'],
                                       'Duration': row['Duration'],
                                       'Weight': row['Weight']}, index=[0])
                df_tmp[columns].to_csv(name, index=False, sep='\t',
                                       header=False)

    def parse(self):
        """
        Parses logfile(s), writes bfsl (optional), and returns subject-info.

        Returns
        -------
        subject_info_list : dict or list of dict
            Subject-info dictionary (see `_parse`) if a single logfile was
            given, else a list with one dictionary per logfile. These hold
            the fields needed to build a Bunch object for Nipype pipelines.
        """
        subject_info_list = [self._parse(f) for f in self.in_file]

        if len(subject_info_list) == 1:
            return subject_info_list[0]
        else:
            return subject_info_list


def parse_presentation_logfile(in_file, con_names, con_codes, con_design=None,
                               con_duration=None, pulsecode=30):
    """
    Functional wrapper around PresentationLogfileCrawler, e.g. for use as the
    function of a Nipype node.

    A crawler is created with the arguments given here (with writing of
    .bfsl files switched on and verbose output switched off), after which
    the result of its `parse` method is returned.

    Parameters
    ----------
    in_file : str or list
        Absolute path to logfile (can be a list of paths).
    con_names : list
        List with names for each condition
    con_codes : list
        List with codes for conditions. Can be a single integer or string (in
        the latter case, it may be a substring) or a list with possible values.
    con_design : list or str
        Which 'design' to assume for events (if 'multivar', all events -
        regardless of condition - are treated as a separate
        condition/regressor; if 'univar', all events from a single condition
        are treated as a single condition). Default: 'univar' for all
        conditions.
    con_duration : list
        If the duration cannot be parsed from the logfile, you can specify them
        here manually (per condition).
    pulsecode : int
        Code with which the first (or any) pulse is logged.

    Returns
    -------
    subject_info_files
        Whatever `PresentationLogfileCrawler.parse` returns for `in_file`.
    """

    # Imported locally, so the function body is self-contained.
    from skbold.exp_model import PresentationLogfileCrawler

    crawler_settings = dict(in_file=in_file,
                            con_names=con_names,
                            con_codes=con_codes,
                            con_design=con_design,
                            con_duration=con_duration,
                            pulsecode=pulsecode,
                            write_bfsl=True,
                            verbose=False)

    crawler = PresentationLogfileCrawler(**crawler_settings)

    return crawler.parse()