# Load packages for data handling and plotting
library(tidyverse)

# clear global environment (get rid of any old variables that might be hanging around)
rm(list = ls())

# read in the data
df <- read.csv("/Users/ethan/Documents/GitHub/ethanweed.github.io/r-tutorials/data/Stroop-raw-over-the-years.csv")


# calculate the mean of each column
column_means <- colMeans(df)

# add an index column giving each observation an x-axis position
df$points <- seq_len(nrow(df))

RNI <- ggplot(data = df, aes(x = points, y = Reading_NoInt)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Reading_NoInt"]) +
  ylim(0, 18)

NI <- ggplot(data = df, aes(x = points, y = Naming_Int)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Naming_Int"]) +
  ylim(0, 18)

NNI <- ggplot(data = df, aes(x = points, y = Naming_NoInt)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Naming_NoInt"]) +
  ylim(0, 18)
              
RI <- ggplot(data = df, aes(x = points, y = Reading_Int)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Reading_Int"]) +
  ylim(0, 18)

library(ggpubr)

ggarrange(RI, RNI, NI, NNI, ncol = 2, nrow = 2)

The variation of the individual data points around the model (the mean) can be described by the standard deviation. \[ s = \sqrt{\frac{\sum_{i=1}^N (x_i - \bar{x})^2}{N-1} } \] In this equation, \(\textstyle\{x_1,\,x_2,\,\ldots,\,x_N\}\) are the observed values of the sample items, \(\textstyle\bar{x}\) is the mean value of these observations, and \(N\) is the number of observations in the sample.
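We can check this formula against R's built-in sd() function; a minimal sanity check, using the Reading_NoInt column as an example (any condition would do):

# standard deviation "by hand", following the formula above
x <- df$Reading_NoInt
N <- length(x)
sqrt(sum((x - mean(x))^2) / (N - 1))

# R's built-in equivalent; the two numbers should match
sd(x)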

“t-test”

Let’s just concentrate on the conditions with interference. Is there a significant difference between Reading and Naming?

ggarrange(RI, NI, ncol = 2)

The data from the two conditions look different. But are they really? Put differently: if we model these data as being drawn from two different distributions, does this two-distribution model describe the data better than the alternative, namely that all the data points are drawn from a single distribution?

If we look at the distribution of all the Reading with Interference and Naming with Interference data points, it looks like this:

library(ggpubr)
df_long <- gather(df, key = "Condition", value = "Time", -Year)
df_int <- subset(df_long, Condition == "Reading_Int" | Condition == "Naming_Int")
df_int$Condition <- as.factor(df_int$Condition)

p1 <- ggplot(df_int, aes(Time)) +
  geom_density() +
  theme_classic() +
  labs(title = "Single distribution model")

p1

How would these same data look if we modelled them as coming from two different distributions?

# make individual plots for different conditions
p2 <- ggplot(df_int, aes(Time, fill = Condition)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(title = "Two-distribution model")
p2

There is some overlap in the data, but the two distributions do appear to be distinct. It looks as if the two-distribution model fits the data relatively well.

ggarrange(p1, p2, ncol = 2)

This comparison of distributions can be formalized as the “t-test”. In this case, we use a paired t-test, since the same participants were tested in both conditions. Other variants include the independent-samples t-test, for comparing e.g. two different groups of participants, and the single-sample t-test, for testing whether a sample mean differs significantly from a specified number, e.g. zero.
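For the paired version, the test statistic is simply the mean of the within-participant differences divided by its standard error: \[ t = \frac{\bar{d}}{s_d / \sqrt{N}} \] where \(\bar{d}\) is the mean of the differences \(d_i\) between the two conditions for each participant, \(s_d\) is the standard deviation of those differences, and the test has \(N-1\) degrees of freedom.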

library(pander)
Reading <- subset(df_int$Time, df_int$Condition == "Reading_Int")
Naming <- subset(df_int$Time, df_int$Condition == "Naming_Int")
t <- t.test(Reading, Naming, paired = TRUE)
pander(t)
Paired t-test: Reading and Naming

| Test statistic | df  | P value       | Alternative hypothesis | Mean of the differences |
|----------------|-----|---------------|------------------------|-------------------------|
| -16.77         | 103 | 3.337e-31 *** | two.sided              | -3.837                  |
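We can reproduce the test statistic by hand from the formula above; a quick check, reusing the Reading and Naming vectors defined for the t-test:

# paired t statistic "by hand": mean difference divided by its standard error
d <- Reading - Naming
mean(d) / (sd(d) / sqrt(length(d)))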

We should be able to take random samples from the same overall distribution and see the same pattern most of the time. Here are 10 random re-samples of 100 data points each from the total pool of data.

set.seed(42)
for (i in 1:10) {
  print(paste0("Sample ", i))
  # draw 100 rows at random (without replacement) and plot the two densities
  d <- df_int[sample(nrow(df_int), 100), ]
  print(ggplot(d, aes(Time, fill = Condition)) +
          geom_density(alpha = 0.5) +
          theme_classic())
}
## [1] "Sample 1"

## [1] "Sample 2"

## [1] "Sample 3"

## [1] "Sample 4"

## [1] "Sample 5"

## [1] "Sample 6"

## [1] "Sample 7"

## [1] "Sample 8"

## [1] "Sample 9"

## [1] "Sample 10"

“ANOVA”

# remove the helper "points" rows that were gathered along with the four conditions, and convert Condition to a factor
df_long <- subset(df_long, Condition != "points")
df_long$Condition <- as.factor(df_long$Condition)

# make individual plots for different conditions


p1 <- ggplot(df_long, aes(Time)) +
  geom_density() +
  theme_classic() +
  labs(title = "All Data")

p2 <- ggplot(df_long, aes(Time, fill = Condition)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(title = "Data Modelled by Condition")

ggarrange(p1, p2, ncol = 1, nrow = 2)

# Find mean of all data points
GrandMean <- mean(df_long$Time)

# Find means for each condition
meanNI <- mean(df$Naming_Int)
meanRI <- mean(df$Reading_Int)
meanNNI <- mean(df$Naming_NoInt) 
meanRNI <- mean(df$Reading_NoInt)

# Give each observation an x-axis position (df_long has no index column yet, so we create one here)
df_long$measurements <- seq_len(nrow(df_long))

# Get x-axis values for each data point, grouped by condition
xNI <- subset(df_long$measurements, df_long$Condition == "Naming_Int")
xRI <- subset(df_long$measurements, df_long$Condition == "Reading_Int")
xNNI <- subset(df_long$measurements, df_long$Condition == "Naming_NoInt")
xRNI <- subset(df_long$measurements, df_long$Condition == "Reading_NoInt")


# Make a vector of endpoints for the segments connecting each data point to its condition mean (in the same row order as df_long)
model_means <- c(rep(meanRNI, length(xRNI)), rep(meanNI, length(xNI)), rep(meanNNI, length(xNNI)), rep(meanRI, length(xRI)))

# Plot the total sums of squares
p_TSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_point() +
  geom_segment(aes(xend = measurements, yend = GrandMean), alpha = 0.2) +
  geom_hline(yintercept = GrandMean, linetype = "dashed") +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Total Sums of Squares")

# Plot the residual sums of squares
p_RSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_point() +
  geom_segment(aes(xend = measurements, yend = model_means), alpha = 0.2) +
  geom_segment(aes(x = min(xNI), xend = max(xNI), y = meanNI, yend = meanNI), colour = "black") +
  geom_segment(aes(x = min(xRI), xend = max(xRI), y = meanRI, yend = meanRI), colour = "black") +
  geom_segment(aes(x = min(xNNI), xend = max(xNNI), y = meanNNI, yend = meanNNI), colour = "black") +
  geom_segment(aes(x = min(xRNI), xend = max(xRNI), y = meanRNI, yend = meanRNI), colour = "black") +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Residual Sums of Squares")

# Plot the model sums of squares
p_MSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_segment(aes(x = min(xNI), xend = max(xNI), y = meanNI, yend = meanNI), colour = "black") +
  geom_segment(aes(x = min(xRI), xend = max(xRI), y = meanRI, yend = meanRI), colour = "black") +
  geom_segment(aes(x = min(xNNI), xend = max(xNNI), y = meanNNI, yend = meanNNI), colour = "black") +
  geom_segment(aes(x = min(xRNI), xend = max(xRNI), y = meanRNI, yend = meanRNI), colour = "black") +
  geom_hline(yintercept = GrandMean, linetype = "dashed") +
  geom_segment(aes(x = median(xNI), xend = median(xNI), y = meanNI, yend = GrandMean)) +
  geom_segment(aes(x = median(xRI), xend = median(xRI), y = meanRI, yend = GrandMean)) +
  geom_segment(aes(x = median(xNNI), xend = median(xNNI), y = meanNNI, yend = GrandMean)) +
  geom_segment(aes(x = median(xRNI), xend = median(xRNI), y = meanRNI, yend = GrandMean)) +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Model Sums of Squares")

# Make a three-panel plot
ggarrange(p_TSS, p_RSS, p_MSS, ncol = 3)
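These three quantities are linked by the standard ANOVA identity: the total variation splits into the part captured by the model and the part left over in the residuals, \[ \underbrace{\sum_{i}(x_i - \bar{x}_{grand})^2}_{SS_{total}} = \underbrace{\sum_{i}(\bar{x}_{cond(i)} - \bar{x}_{grand})^2}_{SS_{model}} + \underbrace{\sum_{i}(x_i - \bar{x}_{cond(i)})^2}_{SS_{residual}} \] where \(\bar{x}_{cond(i)}\) denotes the mean of the condition that observation \(i\) belongs to.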

The F-value, the critical statistic in the table below, is the “Mean Square” of the model, divided by the “Mean Square” of the residuals: \(F=\frac{MS_{model}}{MS_{residual}}\)

# calculate ANOVA
library(lmerTest)

# df_long does not yet contain the Task, Interference, and ID columns the model needs; assuming each row of the original wide dataframe is one participant, we can construct them:
df_long$ID <- as.factor(rep(seq_len(nrow(df)), times = 4))
df_long <- separate(df_long, Condition, into = c("Task", "Interference"), sep = "_", remove = FALSE)
df_long$Interference <- recode(df_long$Interference, Int = "Interference", NoInt = "No_Interference")

# Define our model. Here, we are predicting Time as modeled by Task, Interference, and the interaction of Task and Interference. We are modelling ID as a "random effect".
mod <- lmer(Time ~ Task + Interference + Task*Interference + (1|ID), data = df_long)
ans <- anova(mod)
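(Note that R's formula syntax expands Task*Interference to both main effects plus their interaction, so the model above is equivalent to the more compact Time ~ Task * Interference + (1|ID).)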

Display the ANOVA results in a nice table

# Display the ANOVA results in a nice table
library(pander)
pander(ans)
Type III Analysis of Variance Table with Satterthwaite’s method

| Effect            | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F)    |
|-------------------|--------|---------|-------|-------|---------|-----------|
| Task              | 693.9  | 693.9   | 1     | 309   | 476.8   | 1.378e-64 |
| Interference      | 561.3  | 561.3   | 1     | 309   | 385.6   | 2.7e-56   |
| Task:Interference | 163.5  | 163.5   | 1     | 309   | 112.4   | 1.35e-22  |
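A quick sanity check on the table: each F value should be that row's mean square divided by the shared residual mean square. The Task row implies a residual mean square of about \(693.9 / 476.8 \approx 1.46\), and the other two rows give the same value (\(561.3 / 385.6 \approx 1.46\); \(163.5 / 112.4 \approx 1.45\)), as they should.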

Display Estimates for Conditions

# Display Estimates for Conditions
library(psycho)
library(pander)
mod_means <- get_means(mod)
pander(mod_means)
| Task    | Interference    | Mean  | SE     | df    | CI_lower | CI_higher |
|---------|-----------------|-------|--------|-------|----------|-----------|
| Naming  | Interference    | 9.049 | 0.1401 | 330.3 | 8.773    | 9.324     |
| Reading | Interference    | 5.212 | 0.1401 | 330.3 | 4.936    | 5.487     |
| Naming  | No_Interference | 5.472 | 0.1401 | 330.3 | 5.196    | 5.747     |
| Reading | No_Interference | 4.143 | 0.1401 | 330.3 | 3.867    | 4.418     |

Calculate post-hoc contrasts (Tukey)

# Calculate post-hoc contrasts (Tukey)
post_hoc <- get_contrasts(mod)
pander(post_hoc[,c(1,3:6)])
| Contrast                                         | SE     | df  | t      | p         |
|--------------------------------------------------|--------|-----|--------|-----------|
| Naming,Interference - Reading,Interference       | 0.1673 | 309 | 22.93  | 1.011e-12 |
| Naming,Interference - Naming,No_Interference     | 0.1673 | 309 | 21.38  | 1.011e-12 |
| Naming,Interference - Reading,No_Interference    | 0.1673 | 309 | 29.33  | 1.011e-12 |
| Reading,Interference - Naming,No_Interference    | 0.1673 | 309 | -1.554 | 0.4066    |
| Reading,Interference - Reading,No_Interference   | 0.1673 | 309 | 6.391  | 3.646e-09 |
| Naming,No_Interference - Reading,No_Interference | 0.1673 | 309 | 7.945  | 1.27e-12  |
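Every pairwise contrast is significant except Reading with interference vs. Naming without interference (p = 0.41). In other words, naming the ink color under interference is reliably slower than every other condition, which is the classic Stroop effect.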