# Load packages for data handling and plotting
library(tidyverse)

# clear global environment (get rid of any old variables that might be hanging around)
rm(list = ls())

# read in the data
df <- read.csv("/Users/ethan/Documents/GitHub/ethanweed.github.io/r-tutorials/data/Stroop-raw-over-the-years.csv")


# calculate the mean of each column
column_means <- colMeans(df)

# add an index column giving each observation an x-axis position
df$points <- seq_len(nrow(df))

RNI <- ggplot(data = df, aes(x = points, y = Reading_NoInt)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Reading_NoInt"]) +
  ylim(0, 18)

NI <- ggplot(data = df, aes(x = points, y = Naming_Int)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Naming_Int"]) +
  ylim(0, 18)

NNI <- ggplot(data = df, aes(x = points, y = Naming_NoInt)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Naming_NoInt"]) +
  ylim(0, 18)
              
RI <- ggplot(data = df, aes(x = points, y = Reading_Int)) +
  geom_point() + 
  theme_classic() +
  geom_hline(yintercept = column_means["Reading_Int"]) +
  ylim(0, 18)

library(ggpubr)

ggarrange(RI, RNI, NI, NNI, ncol = 2, nrow = 2)

The variation of the individual data points around the model (the mean) can be described by the standard deviation. \[ s = \sqrt{\frac{\sum_{i=1}^N (x_i - \bar{x})^2}{N-1} } \] In this equation, \(\textstyle\{x_1,\,x_2,\,\ldots,\,x_N\}\) are the observed values of the sample items, \(\textstyle\bar{x}\) is the mean value of these observations, and \(N\) is the number of observations in the sample.
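We can check this formula against R's built-in sd() function; a minimal sanity check, using the Reading_NoInt column as an example (any condition would do):

# standard deviation "by hand", following the formula above
x <- df$Reading_NoInt
N <- length(x)
sqrt(sum((x - mean(x))^2) / (N - 1))

# R's built-in equivalent; the two numbers should match
sd(x)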

“t-test”

Let’s just concentrate on the conditions with interference. Is there a significant difference between Reading and Naming?

ggarrange(RI, NI, ncol = 2)

The data from the two conditions look different. But are they really? Put differently: if we model these data as being drawn from two different distributions, does this two-distribution model describe the data better than the alternative, namely that all the data points are drawn from a single distribution?

If we look at the distribution of all the Reading with Interference and Naming with Interference data points, it looks like this:

library(ggpubr)
df_long <- gather(df, key = "Condition", value = "Time", -Year)
df_int <- subset(df_long, Condition == "Reading_Int" | Condition == "Naming_Int")
df_int$Condition <- as.factor(df_int$Condition)

p1 <- ggplot(df_int, aes(Time)) +
  geom_density() +
  theme_classic() +
  labs(title = "Single distribution model")

p1

How would these same data look if we modelled them as coming from two different distributions?

# make individual plots for different conditions
p2 <- ggplot(df_int, aes(Time, fill = Condition)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(title = "Two-distribution model")
p2

There is some overlap in the data, but the two distributions do appear to be distinct. It looks as if the two-distribution model fits the data relatively well.

ggarrange(p1, p2, ncol = 2)

This comparison of distributions can be formalized as the “t-test”. In this case, we use a paired t-test, since the same participants were tested in both conditions. Other variants include the independent-samples t-test, for comparing e.g. two different groups of participants, and the single-sample t-test, for testing whether a sample mean differs significantly from a specified number, e.g. zero.
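For the paired version, the test statistic is simply the mean of the within-participant differences divided by its standard error: \[ t = \frac{\bar{d}}{s_d / \sqrt{N}} \] where \(\bar{d}\) is the mean of the differences \(d_i\) between the two conditions for each participant, \(s_d\) is the standard deviation of those differences, and the test has \(N-1\) degrees of freedom.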

library(pander)
Reading <- subset(df_int$Time, df_int$Condition == "Reading_Int")
Naming <- subset(df_int$Time, df_int$Condition == "Naming_Int")
t <- t.test(Reading, Naming, paired = TRUE)
pander(t)
Paired t-test: Reading and Naming

| Test statistic | df  | P value       | Alternative hypothesis | Mean of the differences |
|----------------|-----|---------------|------------------------|-------------------------|
| -16.77         | 103 | 3.337e-31 *** | two.sided              | -3.837                  |
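We can reproduce the test statistic by hand from the formula above; a quick check, reusing the Reading and Naming vectors defined for the t-test:

# paired t statistic "by hand": mean difference divided by its standard error
d <- Reading - Naming
mean(d) / (sd(d) / sqrt(length(d)))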

We should be able to take random samples from the same overall distribution and see the same pattern most of the time. Here are 10 random re-samples of 100 data points each from the total pool of data.

set.seed(42)
for (i in 1:10) {
  print(paste0("Sample ", i))
  # draw 100 rows at random (without replacement) and plot the two densities
  d <- df_int[sample(nrow(df_int), 100), ]
  print(ggplot(d, aes(Time, fill = Condition)) +
          geom_density(alpha = 0.5) +
          theme_classic())
}
## [1] "Sample 1"

## [1] "Sample 2"

## [1] "Sample 3"

## [1] "Sample 4"

## [1] "Sample 5"

## [1] "Sample 6"

## [1] "Sample 7"

## [1] "Sample 8"

## [1] "Sample 9"

## [1] "Sample 10"

“ANOVA”

# remove the helper "points" rows that were gathered along with the four conditions, and convert Condition to a factor
df_long <- subset(df_long, Condition != "points")
df_long$Condition <- as.factor(df_long$Condition)

# make individual plots for different conditions


p1 <- ggplot(df_long, aes(Time)) +
  geom_density() +
  theme_classic() +
  labs(title = "All Data")

p2 <- ggplot(df_long, aes(Time, fill = Condition)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  labs(title = "Data Modelled by Condition")

ggarrange(p1, p2, ncol = 1, nrow = 2)

# Find mean of all data points
GrandMean <- mean(df_long$Time)

# Find means for each condition
meanNI <- mean(df$Naming_Int)
meanRI <- mean(df$Reading_Int)
meanNNI <- mean(df$Naming_NoInt) 
meanRNI <- mean(df$Reading_NoInt)

# Give each observation an x-axis position (df_long has no index column yet, so we create one here)
df_long$measurements <- seq_len(nrow(df_long))

# Get x-axis values for each data point, grouped by condition
xNI <- subset(df_long$measurements, df_long$Condition == "Naming_Int")
xRI <- subset(df_long$measurements, df_long$Condition == "Reading_Int")
xNNI <- subset(df_long$measurements, df_long$Condition == "Naming_NoInt")
xRNI <- subset(df_long$measurements, df_long$Condition == "Reading_NoInt")


# Make a vector of endpoints for the segments connecting each data point to its condition mean (in the same row order as df_long)
model_means <- c(rep(meanRNI, length(xRNI)), rep(meanNI, length(xNI)), rep(meanNNI, length(xNNI)), rep(meanRI, length(xRI)))

# Plot the total sums of squares
p_TSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_point() +
  geom_segment(aes(xend = measurements, yend = GrandMean), alpha = 0.2) +
  geom_hline(yintercept = GrandMean, linetype = "dashed") +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Total Sums of Squares")

# Plot the residual sums of squares
p_RSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_point() +
  geom_segment(aes(xend = measurements, yend = model_means), alpha = 0.2) +
  geom_segment(aes(x = min(xNI), xend = max(xNI), y = meanNI, yend = meanNI), colour = "black") +
  geom_segment(aes(x = min(xRI), xend = max(xRI), y = meanRI, yend = meanRI), colour = "black") +
  geom_segment(aes(x = min(xNNI), xend = max(xNNI), y = meanNNI, yend = meanNNI), colour = "black") +
  geom_segment(aes(x = min(xRNI), xend = max(xRNI), y = meanRNI, yend = meanRNI), colour = "black") +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Residual Sums of Squares")

# Plot the model sums of squares
p_MSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
  geom_segment(aes(x = min(xNI), xend = max(xNI), y = meanNI, yend = meanNI), colour = "black") +
  geom_segment(aes(x = min(xRI), xend = max(xRI), y = meanRI, yend = meanRI), colour = "black") +
  geom_segment(aes(x = min(xNNI), xend = max(xNNI), y = meanNNI, yend = meanNNI), colour = "black") +
  geom_segment(aes(x = min(xRNI), xend = max(xRNI), y = meanRNI, yend = meanRNI), colour = "black") +
  geom_hline(yintercept = GrandMean, linetype = "dashed") +
  geom_segment(aes(x = median(xNI), xend = median(xNI), y = meanNI, yend = GrandMean)) +
  geom_segment(aes(x = median(xRI), xend = median(xRI), y = meanRI, yend = GrandMean)) +
  geom_segment(aes(x = median(xNNI), xend = median(xNNI), y = meanNNI, yend = GrandMean)) +
  geom_segment(aes(x = median(xRNI), xend = median(xRNI), y = meanRNI, yend = GrandMean)) +
  ylim(0, 18) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(title = "Model Sums of Squares")

# Make a three-panel plot
ggarrange(p_TSS, p_RSS, p_MSS, ncol = 3)
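These three quantities are linked by the standard ANOVA identity: the total variation splits into the part captured by the model and the part left over in the residuals, \[ \underbrace{\sum_{i}(x_i - \bar{x}_{grand})^2}_{SS_{total}} = \underbrace{\sum_{i}(\bar{x}_{cond(i)} - \bar{x}_{grand})^2}_{SS_{model}} + \underbrace{\sum_{i}(x_i - \bar{x}_{cond(i)})^2}_{SS_{residual}} \] where \(\bar{x}_{cond(i)}\) denotes the mean of the condition that observation \(i\) belongs to.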

The F-value, the critical statistic in the table below, is the “Mean Square” of the model, divided by the “Mean Square” of the residuals: \(F=\frac{MS_{model}}{MS_{residual}}\)

# calculate ANOVA
library(lmerTest)

# df_long does not yet contain the Task, Interference, and ID columns the model needs; assuming each row of the original wide dataframe is one participant, we can construct them:
df_long$ID <- as.factor(rep(seq_len(nrow(df)), times = 4))
df_long <- separate(df_long, Condition, into = c("Task", "Interference"), sep = "_", remove = FALSE)
df_long$Interference <- recode(df_long$Interference, Int = "Interference", NoInt = "No_Interference")

# Define our model. Here, we are predicting Time as modeled by Task, Interference, and the interaction of Task and Interference. We are modelling ID as a "random effect".
mod <- lmer(Time ~ Task + Interference + Task*Interference + (1|ID), data = df_long)
ans <- anova(mod)
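(Note that R's formula syntax expands Task*Interference to both main effects plus their interaction, so the model above is equivalent to the more compact Time ~ Task * Interference + (1|ID).)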

Display the ANOVA results in a nice table

# Display the ANOVA results in a nice table
library(pander)
pander(ans)
Type III Analysis of Variance Table with Satterthwaite’s method

| Effect            | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F)    |
|-------------------|--------|---------|-------|-------|---------|-----------|
| Task              | 693.9  | 693.9   | 1     | 309   | 476.8   | 1.378e-64 |
| Interference      | 561.3  | 561.3   | 1     | 309   | 385.6   | 2.7e-56   |
| Task:Interference | 163.5  | 163.5   | 1     | 309   | 112.4   | 1.35e-22  |
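A quick sanity check on the table: each F value should be that row's mean square divided by the shared residual mean square. The Task row implies a residual mean square of about \(693.9 / 476.8 \approx 1.46\), and the other two rows give the same value (\(561.3 / 385.6 \approx 1.46\); \(163.5 / 112.4 \approx 1.45\)), as they should.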

Display Estimates for Conditions

# Display Estimates for Conditions
library(psycho)
library(pander)
mod_means <- get_means(mod)
pander(mod_means)
| Task    | Interference    | Mean  | SE     | df    | CI_lower | CI_higher |
|---------|-----------------|-------|--------|-------|----------|-----------|
| Naming  | Interference    | 9.049 | 0.1401 | 330.3 | 8.773    | 9.324     |
| Reading | Interference    | 5.212 | 0.1401 | 330.3 | 4.936    | 5.487     |
| Naming  | No_Interference | 5.472 | 0.1401 | 330.3 | 5.196    | 5.747     |
| Reading | No_Interference | 4.143 | 0.1401 | 330.3 | 3.867    | 4.418     |

Calculate post-hoc contrasts (Tukey)

# Calculate post-hoc contrasts (Tukey)
post_hoc <- get_contrasts(mod)
pander(post_hoc[,c(1,3:6)])
| Contrast                                         | SE     | df  | t      | p         |
|--------------------------------------------------|--------|-----|--------|-----------|
| Naming,Interference - Reading,Interference       | 0.1673 | 309 | 22.93  | 1.011e-12 |
| Naming,Interference - Naming,No_Interference     | 0.1673 | 309 | 21.38  | 1.011e-12 |
| Naming,Interference - Reading,No_Interference    | 0.1673 | 309 | 29.33  | 1.011e-12 |
| Reading,Interference - Naming,No_Interference    | 0.1673 | 309 | -1.554 | 0.4066    |
| Reading,Interference - Reading,No_Interference   | 0.1673 | 309 | 6.391  | 3.646e-09 |
| Naming,No_Interference - Reading,No_Interference | 0.1673 | 309 | 7.945  | 1.27e-12  |
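Every pairwise contrast is significant except Reading with interference vs. Naming without interference (p = 0.41). In other words, naming the ink color under interference is reliably slower than every other condition, which is the classic Stroop effect.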