Commit 97652d60 authored by Lennart Wittkuhn

add behavioral analysis file

parent 10410a62
@@ -2,4 +2,5 @@
 * annex.backend=MD5E
 **/.git* annex.largefiles=nothing
 CHANGELOG.md annex.largefiles=nothing
-README.md annex.largefiles=nothing
\ No newline at end of file
+README.md annex.largefiles=nothing
+highspeed-analysis.Rproj annex.largefiles=nothing
.Rproj.user
.Rhistory
.RData
.Ruserdata
code/libs
code/*.html
figures
@@ -2,3 +2,6 @@
 path = data/bids
 url = https://github.com/lnnrtwttkhn/highspeed-bids
 datalad-id = 94ae2510-f811-11ea-a3da-00e04c10078d
+[submodule "code/raincloud-plots"]
+path = code/raincloud-plots
+url = https://github.com/RainCloudPlots/RainCloudPlots.git
# find the path to the root of this project:
path_root <- rprojroot::find_rstudio_root_file()
source(file.path(path_root, "code", "highspeed-analysis-source.R"))
# if needed, revert to an older ggplot2 version for compatibility with lemon:
# cf. https://github.com/stefanedwards/lemon/issues/20
# devtools::install_version("ggplot2", version = "3.3.0", repos = "http://cran.us.r-project.org")
# create a character vector with all required packages:
packages <- c("R.matlab", "ggplot2", "dplyr", "ggpubr", "psyphy", "ggm",
"corrplot", "reshape2", "afex", "polycor", "tinytex", "extrafont",
"viridis", "rjson", "jsonlite", "tidyr", "combinat", "rlist",
"lemon", "doMC", "plyr", "styler", "gridExtra", "grid", "report",
"data.table", "NameNeedle", "textreuse", "stringdist", "emmeans",
"RColorBrewer", "tidyverse", "gtools", "cowplot", "emmeans",
"assertr", "lavaan", "rmarkdown", "readr", "caTools", "bitops",
"broom", "ggridges", "nloptr", "devtools", "bookdown", "rstatix",
"lomb")
# load packages using the load_packages function:
load_packages(packages_list = packages)
# source additional analysis scripts:
source(here::here("code", "highspeed-cluster-permutation.R"))
source(here::here("code", "raincloud-plots", "tutorial_R", "R_rainclouds.R"))
source(here::here("code", "raincloud-plots", "tutorial_R", "summarySE.R"))
# path to figures created by the analysis code:
path_figures <- here::here("figures")
# path to the participants.tsv file (according to BIDS):
# datalad get data/bids/participants.tsv
path_participants <- here::here("data", "bids", "participants.tsv")
# path to the events.tsv files (according to BIDS):
path_events <- here::here("data", "bids", "*", "*", "func", "*events.tsv")
# path to data from the decoding analysis:
path_pred <- here::here("data", "decoding", "*", "data", "*decoding.csv")
# path to the data from the mask thresholding:
path_thresh <- here::here("data", "decoding", "*", "data", "*thresholding.csv")
# path to the data of the voxel patterns:
path_patterns <- here::here("data", "decoding", "*", "data", "*voxel_patterns_union.csv")
# Load all [BIDS](http://bids.neuroimaging.io/) events files:
# read all events files and concatenate them in a data table:
# datalad get data/bids/sub-*/ses-*/func/*events.tsv
dt_events <- do.call(rbind, lapply(Sys.glob(path_events), fread))
# add the cue of the current trial and the response accuracy on a given trial:
dt_events[, by = .(subject, condition, trial), ":=" (
trial_cue = stim_label[!is.na(target) & target != 0],
trial_accuracy = accuracy[!is.na(accuracy)]
)]
# read all decoding data files and concatenate them in a data table:
#dt_pred <- do.call(rbind, lapply(Sys.glob(path_pred), data.table::fread))
# prepare data (add additional variables like a speed trial counter etc.)
#dt_pred <- prepare_data(dt_pred)
# read all data from the mask thresholding:
#dt_thresh = do.call(rbind, lapply(Sys.glob(path_thresh), data.table::fread))
# read all data of the voxel patterns:
#dt_patterns = lapply(Sys.glob(path_patterns), data.table::fread)
# create color list for probabilities of individual sequence events:
color_events <- rev(hcl.colors(5, "Zissou 1"))
# define global control settings used in all linear mixed-effects (lmer) models:
lcctrl <- lmerControl(optimizer = c("bobyqa"), optCtrl = list(maxfun = 500000),
calc.derivs = FALSE)
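# usage sketch (hypothetical model formula; lme4 is assumed to be available
# as a dependency of the packages loaded above):
# lmer(probability ~ tITI + (1 | id), data = dt_pred, control = lcctrl)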
load_packages <- function(packages_list) {
# install required packages if they are not installed yet:
new_packages <- packages_list[
!(packages_list %in% installed.packages()[, "Package"])]
if (length(new_packages)) install.packages(new_packages)
# load all required packages:
invisible(lapply(packages_list, library, character.only = TRUE))
}
save_datatable <- function(dt, path) {
write.table(dt, file = path, sep = ",", row.names = FALSE)
}
detect_cores <- function() {
  # detect the maximum number of computing cores available to this session;
  # this information can be used to parallelize execution where needed:
  num_cores <- parallel::detectCores(all.tests = TRUE, logical = TRUE)
  message(sprintf("Number of available computing cores: %d", num_cores))
  # register all available cores for parallel processing in R:
  doMC::registerDoMC(cores = num_cores)
  return(num_cores)
}
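# usage sketch: call once during setup so that parallel backends (e.g.,
# plyr or foreach with .parallel = TRUE) can use the registered cores:
# num_cores <- detect_cores()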
round_pvalues <- function(pvalue) {
# This function can be used to round p-values.
pvalue_rounded <- vector()
for (p in seq_len(length(pvalue))) {
pvalue_rounded[p] <- format.pval(
pv = pvalue[p], digits = 1, eps = 0.001, nsmall = 2, scientific = FALSE)
if (pvalue_rounded[p] == "<0.001") {
pvalue_rounded[p] <- gsub("<", "p < ", pvalue_rounded[p])
} else {
pvalue_rounded[p] <- paste0("p = ", pvalue_rounded[p])
}
    # remove the leading zero before the decimal point (APA style):
    pvalue_rounded[p] <- stringr::str_replace(pvalue_rounded[p], "0\\.", ".")
}
return(pvalue_rounded)
}
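# usage sketch (hypothetical values):
# round_pvalues(c(0.034567, 0.0004)) # returns "p = .03" "p < .001"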
label_fill <- function(original, offset = 0, mod = 2, fill = "") {
# This function can be used to generate axis labels that omit e.g.,
# every second label. Solution was taken from [here](https://bit.ly/2VycSy0).
ii <- as.logical((seq_len(length(original)) - 1 + offset) %% mod)
original[ii] <- fill
return(original)
}
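# usage sketch: blank out every second axis label:
# label_fill(seq(0, 7), mod = 2) # returns "0" "" "2" "" "4" "" "6" ""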
extract_number <- function(mixed_string) {
  # this function can be used to extract a number from a mixed string:
  number <- regmatches(mixed_string, gregexpr("[[:digit:]]+", mixed_string))
  number <- as.numeric(unlist(number))
  return(number)
}
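# usage sketch: extract_number("sub-14") returns 14; strings with several
# digit groups return all of them, e.g., extract_number("run-01_trial-23")
# returns c(1, 23)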
get_labeller <- function(array, suffix = " ms") {
facet_labels_new <- unique(paste0(as.numeric(array) * 1000, suffix))
facet_labels_old <- as.character(unique(array))
names(facet_labels_new) <- facet_labels_old
labeller <- as_labeller(facet_labels_new)
return(labeller)
}
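# usage sketch (hypothetical speed conditions, in seconds): creates facet
# labels in milliseconds, e.g., 0.032 is mapped to "32 ms":
# get_labeller(array = c(0.032, 2.048))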
# function to get sequential position and switch to next sequential stimulus:
get_pos <- function(data, events) {
# get the matching subject id:
sub_id <- events$subject == unique(data$id)
# get the matching sequence trial number (trial)
trial <- events$trial == unique(data$trial)
# get the sequence of the current trial:
seq <- events$stim_label[sub_id & trial]
# get only unique elements of the sequence while maintaining their order:
seq_items <- unique(seq)
  # get the index of the switch to the second unique item within the sequence:
  change <- min(which(seq == seq_items[2]))
# get the sequential position of the current label:
position <- which(seq_items == unique(data$class))
# repeat the sequential position as needed (depending on length of the data):
position <- rep(position, nrow(data))
# repeat the change trial as needed (depending on length of the data):
change <- rep(change, nrow(data))
# get the target cue of the current trial:
trial_cue <- rep(unique(events$trial_cue[sub_id & trial]), nrow(data))
# get the target cue position of the current trial:
tmp_target <- events$target[sub_id & trial]
tmp_position <- events$serial_position[sub_id & trial]
trial_cue_position <- rep(tmp_position[tmp_target == 1], nrow(data))
# get the accuracy of the current trial:
accuracy <- rep(unique(events$trial_accuracy[sub_id & trial]), nrow(data))
# return the position and change indices as result of the function:
return(list(position = position, change = change,
trial_cue = trial_cue, accuracy = accuracy,
trial_cue_position = trial_cue_position))
}
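# usage sketch (hypothetical grouping; assumes dt_pred and dt_events exist):
# dt_pred[, c("position", "change", "trial_cue", "accuracy",
#             "trial_cue_position") := get_pos(.SD, dt_events),
#         by = .(id, trial, class)]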
prepare_data <- function(dt) {
library(data.table)
dt <- setDT(dt)
# specify whether one-vs-rest or multi-class classifiers have been used:
dt[, classification := ifelse(classifier == "log_reg", "multi", "ovr")]
# add column to specify the task condition:
dt[grepl("odd", test_set, fixed = TRUE), condition := "oddball"]
dt[grepl("seq", test_set, fixed = TRUE), condition := "sequence"]
dt[grepl("rep", test_set, fixed = TRUE), condition := "repetition"]
dt[grepl("rest", test_set, fixed = TRUE), condition := "rest"]
# display warning if there is any empty row in the task condition column:
if ( any(is.na(dt$condition)) ) warning("missing condition assignment")
# add a within speed condition trial counter across all runs (max should be 15):
dt[, by = .(id, classifier, condition, tITI, class, seq_tr), ":=" (trial_tITI = 1:.N)]
  # check that the within-condition trial counter matches the expected maxima
  # (15 sequence trials and 45 repetition trials per speed condition):
  if (max(subset(dt, condition == "sequence")$trial_tITI) != 15)
    warning("max within speed counter does not equal 15!")
  if (max(subset(dt, condition == "repetition")$trial_tITI) != 45)
    warning("max within speed counter does not equal 45!")
# probabilities are normalized for each class within a trial to sum up to 1
dt[, by = .(mask, id, condition, classification, classifier, test_set, session, run_study, tITI, trial, class), ":=" (
probability_norm = probability / sum(probability),
probability_zscore = (probability - mean(probability))/sd(probability),
probability_cum = cumsum(probability) / max(cumsum(probability)))] %>%
# check if the number of TRs match:
verify(.[, by = .(mask, id, condition, classification, classifier, test_set, session, run_study, tITI, trial, class), .(
num_trs = .N
)]$num_trs %in% c(1, 5, 7, 13, 233))
# order sequence trial data by participant, classifier, trial and serial TR:
dt = setorder(dt, mask, id, condition, classification, classifier, tITI, trial, class, seq_tr) %>% setDT(.)
# return the prepared data table:
return(dt)
}
data_summary <- function(x) {
# Function to produce summary statistics (mean and +/- sem)
m <- mean(x)
sem_lower <- m - (sd(x) / sqrt(length(x)))
sem_upper <- m + (sd(x) / sqrt(length(x)))
return(c(y = m, ymin = sem_lower, ymax = sem_upper))
}
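# usage sketch: data_summary matches the fun.data interface of ggplot2's
# stat_summary, which expects y, ymin and ymax values (plot variables
# below are hypothetical):
# ggplot(dt, aes(x = tITI, y = probability)) +
#   stat_summary(fun.data = data_summary)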
round_updown <- function(numbers, base) {
numbers_round <- rep(NA, length(numbers))
for (i in seq_len(length(numbers))) {
if (numbers[i] < 0) {
numbers_round[i] <- -base
} else if (numbers[i] > 0) {
numbers_round[i] <- base
}
}
return(numbers_round)
}
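# usage sketch: snap values to a fixed bound depending on their sign, e.g.,
# for symmetric axis limits (note that exact zeros are returned as NA):
# round_updown(c(-2.3, 0.7, 4.1), base = 5) # returns -5 5 5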
# cluster permutation test in R
# function to find clusters of time-points above a specified threshold with the
# clustering being performed separately for samples with a positive and negative
# value in the time-series.
# references:
# Maris, E., & Oostenveld, R. (2007). Nonparametric statistical testing of EEG- and MEG-data.
# Journal of Neuroscience Methods, 164(1), 177–190. https://doi.org/10.1016/j.jneumeth.2007.03.024
cluster_indices <- function(timeseries, threshold) {
# step 1: clustering
# we cluster adjacent time-samples that all exhibit a similar difference
# (in sign and magnitude)
# initialize an empty cluster index vector depending on the length of the timeseries:
cluster_id = replicate(length(timeseries), 0)
  # get positions of time points with absolute t-values above the threshold:
timepoints_thresh = which(abs(timeseries) > threshold)
# set all positions of negative t-values to a negative sign
timepoints = timepoints_thresh * sign(timeseries[timepoints_thresh])
# split the position indices into clusters of consecutive numbers:
clusters = split(abs(timepoints), cumsum(c(1, abs(diff(timepoints)) != 1)))
# write cluster indices into the cluster index vector:
for (i in seq(1,length(clusters))) {
cluster_id[unlist(clusters[i])] = i
}
# return the cluster indices:
return(cluster_id)
}
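# usage sketch (hypothetical t-values): time points 2-3 form a positive
# cluster (id 1) and time point 4 a negative one (id 2); sub-threshold
# samples keep the cluster id 0:
# cluster_indices(timeseries = c(0.5, 3.0, 3.2, -4.0, 0.2), threshold = 2)
# returns 0 1 1 2 0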
shuffle_timeseries <- function(timeseries) {
  # function to randomly flip the sign of a timeseries: the entire
  # participant-specific timeseries is multiplied by -1 or +1 with
  # equal (0.5) probability:
timeseries_shuffled = timeseries * sample(x = c(-1, 1), size = 1)
return(timeseries_shuffled)
}
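# usage sketch: the whole timeseries keeps or flips its sign with equal
# probability, e.g., shuffle_timeseries(c(1, 2, 3)) returns either
# c(1, 2, 3) or c(-1, -2, -3)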
cluster_stats = function(data, cfg, shuffle = TRUE) {
data_clustered = data %>%
# shuffle time series for each participant, classification and speed
# if shuffle = TRUE. If shuffle = FALSE, the data is just copied.
# check if number of timepoints matches the true number of TRs (here, 13):
.[, by = c(cfg$grouping, "id"), ":=" (
n_trs = .N,
variable_data = if(shuffle) {shuffle_timeseries(get(cfg$variable))} else {get(cfg$variable)}
)] %>% verify(n_trs == cfg$n_trs) %>%
# run a two-sided one-sample t-test against 0 at every TR to get a
# timeseries of t-values (by taking out the participant factor):
# check if there were as many data points as there were participants:
.[, by = c(cfg$grouping, "seq_tr"), .(
num_subs = .N,
tvalue = t.test(variable_data, alternative = "two.sided", mu = cfg$baseline)$statistic
)] %>% #verify(num_subs == 40) %>%
# get the indices for clusters with consecutive t-values above / below threshold
# check if number of timepoints matches the true number of TRs (here, 13):
.[, by = c(cfg$grouping), ":=" (
n_trs = .N,
cluster_id = cluster_indices(timeseries = tvalue, threshold = cfg$threshold)
)] %>% verify(n_trs == cfg$n_trs) %>%
# calculate the cluster mass of each cluster, separately for each
# classification and speed condition:
.[, by = c(cfg$grouping, "cluster_id"), .(
cluster_trs = list(seq_tr),
cluster_length = .N,
cluster_mass = sum(tvalue),
cluster_type = ifelse(sum(tvalue) > 0, "positive", "negative")
)] %>%
# add marker to the maximum absolute cluster mass
# rank the clusters according to their absolute cluster mass:
.[, by = c(cfg$grouping), ":=" (
cluster_max = as.integer(abs(cluster_mass) == max(abs(cluster_mass))),
cluster_rank = rank(abs(cluster_mass))
)]
return(data_clustered)
}
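# usage sketch (hypothetical configuration; field names follow the cfg
# accesses above, threshold set to a two-sided t-value cutoff for n = 40):
# cfg <- list(variable = "probability", baseline = 0,
#             threshold = qt(0.975, df = 39), n_trs = 13, n_perms = 10000,
#             grouping = c("classification", "tITI"))
# data_true <- cluster_stats(data = dt, cfg = cfg, shuffle = FALSE)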
cluster_test <- function(data_true, data_perm, cfg) {
# subset the permutation data frame depending on the grouping:
data_perm_select = data_perm %>%
filter(classification == unique(data_true$classification)) %>%
filter(tITI == unique(data_true$tITI)) %>%
filter(cluster_max == 1) %>%
# extract the cluster mass values of the maximum clusters:
mutate(cluster_mass_perm = cluster_mass * cluster_id)
# get the number of "maximum" clusters that were actually "zero"-clusters:
n_zero = sum(data_perm_select$cluster_id == 0)
# get the number of cluster mass values above the true cluster mass:
n_above = sum(abs(data_perm_select$cluster_mass_perm) > abs(data_true$cluster_mass))
# TODO: add column that specifies if above or below threshold (can be used for later plotting):
# retrieve the number of permutations that was run:
n_perms = nrow(data_perm_select)
  # get the Monte Carlo p-value as the proportion of maximum permutation
  # cluster masses that exceed the true cluster mass:
p_value = n_above/n_perms
return(list("p_value" = p_value, "n_perms" = n_perms, "n_zero" = n_zero))
}
cluster_plot <- function(data_true, data_perm){
plot = ggplot(data_perm, aes(x = abs(as.numeric(cluster_mass)))) +
facet_wrap(facets = ~ as.factor(tITI), labeller = get_labeller(array = data_perm$tITI)) +
geom_histogram(binwidth = 0.5) +
geom_vline(data = data_true, aes(
xintercept = abs(as.numeric(cluster_mass)), color = as.factor(cluster_type))) +
xlab("Absolute cluster mass (summed t-values)") +
ylab("Number of permutations") +
scale_color_discrete(name = "Cluster type") +
coord_capped_cart(left = "both", bottom = "both", expand = TRUE) +
theme(legend.position = c(1, 0), legend.justification = c(1, 0)) +
theme(axis.line = element_line(linetype = "solid", lineend = "round")) +
theme(plot.margin = unit(c(t = 1, r = 3, b = 1, l = 1), "pt"))
return(plot)
}
cluster_permutation <- function(data, cfg) {
# wrapper function to combine all steps of the cluster permutation test
# inputs: data = a data frame; variable = string, variable of interest,
# cfg = list, containing important configuration parameters:
# variables, threshold, baseline, grouping, n_perms, n_trs
# get the cluster statistics for the true data:
data_true = cluster_stats(data, cfg, shuffle = FALSE)
# get the cluster statistics for the permuted data:
data_perm = do.call(rbind, replicate(cfg$n_perms, list(cluster_stats(data, cfg, shuffle = TRUE))))
# compare the cluster statistics of the true data to the permuted data:
data_true = data_true %>%
.[, c("p_value", "n_perms", "n_zero") := cluster_test(
data_true = .SD, data_perm = data_perm, cfg = cfg),
by = c(cfg$grouping, "cluster_id"),
.SDcols = colnames(data_true)] %>%
setorder(classification, tITI, cluster_id) %>%
filter(cluster_id != 0) %>% setDT(.)
# plot the permutation distribution and true data:
plot = cluster_plot(
subset(data_true, classification == "ovr"),
subset(data_perm, classification == "ovr"))
return(list(data_true, plot))
}
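# usage sketch (assumes dt and cfg as sketched above):
# results <- cluster_permutation(data = dt, cfg = cfg)
# data_true <- results[[1]]  # cluster statistics with Monte Carlo p-values
# plot_perm <- results[[2]]  # permutation distribution plot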
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX