# Load packages for data handling and plotting
library(tidyverse)
# clear the global environment (get rid of any old variables that might be hanging around)
rm(list = ls())
# read in the data (adjust the path to wherever the csv lives on your machine)
df <- read.csv("/Users/ethan/Documents/GitHub/ethanweed.github.io/r-tutorials/data/Stroop-raw-over-the-years.csv")
# we will reshape these data to the "long" format further down; the line below is kept for reference
#df <- gather(df, key = "Condition", value = "Time", -Year)
# find the mean of each column
column_means <- colMeans(df)
# add an index column, used as the x-axis when plotting the individual data points
df$points <- seq_len(nrow(df))
# plot each condition, with a horizontal line at that condition's mean
RNI <- ggplot(data = df, aes(x = points, y = Reading_NoInt)) +
geom_point() +
theme_classic() +
geom_hline(yintercept = column_means["Reading_NoInt"]) +
ylim(0, 18)
NI <- ggplot(data = df, aes(x = points, y = Naming_Int)) +
geom_point() +
theme_classic() +
geom_hline(yintercept = column_means["Naming_Int"]) +
ylim(0, 18)
NNI <- ggplot(data = df, aes(x = points, y = Naming_NoInt)) +
geom_point() +
theme_classic() +
geom_hline(yintercept = column_means["Naming_NoInt"]) +
ylim(0, 18)
RI <- ggplot(data = df, aes(x = points, y = Reading_Int)) +
geom_point() +
theme_classic() +
geom_hline(yintercept = column_means["Reading_Int"]) +
ylim(0, 18)
library(ggpubr)
# arrange the four condition plots in a 2 x 2 grid
ggarrange(RI, RNI, NI, NNI, ncol = 2, nrow = 2)
The variation of the individual data points around the model (the mean) can be summarized by the standard deviation. \[ s = \sqrt{\frac{\sum_{i=1}^N (x_i - \bar{x})^2}{N-1} } \] In this equation, \(\textstyle\{x_1,\,x_2,\,\ldots,\,x_N\}\) are the observed values of the sample items, \(\textstyle\bar{x}\) is the mean value of these observations, and \(N\) is the number of observations in the sample.
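To convince ourselves that this formula does what we think, we can compute it "by hand" and compare the result with R's built-in sd(); here is a quick sketch using the Reading_NoInt column:
# standard deviation "by hand", following the formula above
x <- df$Reading_NoInt
sqrt(sum((x - mean(x))^2) / (length(x) - 1))
sd(x) # R's built-in function; should return the same value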
Let’s just concentrate on the conditions with interference. Is there a significant difference between Reading and Naming?
ggarrange(RI, NI, ncol = 2)
The data from the two conditions look different. But are they really? Put differently: if we model these data as being drawn from two different distributions, does this two-distribution model better describe the data than the alternative, that is, that all the data points are simply drawn from the same distribution?
If we look at the distribution of all the Reading-with-Interference and Naming-with-Interference data points pooled together, it looks like this:
library(ggpubr)
df_long <- gather(df, key = "Condition", value = "Time", -Year, -points) # don't gather Year or the index column
df_int <- subset(df_long, Condition == "Reading_Int" | Condition == "Naming_Int")
df_int$Condition <- as.factor(df_int$Condition)
p1 <- ggplot(df_int, aes(Time)) +
geom_density() +
theme_classic() +
labs(title = "Single distribution model")
p1
How would these same data look, if we modelled them as coming from two different distributions?
# make individual plots for different conditions
p2 <- ggplot(df_int, aes(Time, fill = Condition)) +
geom_density(alpha = 0.5) +
theme_classic() +
labs(title = "Two-distribution model")
p2
There is some overlap in the data, but the two distributions do appear to be distinct. It looks as if the two-condition model fits the data relatively well.
ggarrange(p1, p2, ncol = 2)
This comparison of distributions can be formalized as the “t-test”. In this case, we use a paired t-test, since the same participants were tested in both conditions. Other variants include the independent-samples t-test, for comparing e.g. two different groups of participants, and the single-sample t-test, for testing whether a sample mean is significantly different from a specified value, e.g. zero. Both are sketched below.
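For reference, the calls for those two variants look like this (illustrative only, with made-up vectors a and b rather than the Stroop data):
# hypothetical data, just to illustrate the call signatures
a <- rnorm(30, mean = 5)
b <- rnorm(30, mean = 6)
t.test(a, b) # independent-samples t-test: two different groups
t.test(a, mu = 0) # single-sample t-test: is the mean of a different from zero?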
library(pander)
# extract the two interference conditions as vectors
Reading <- subset(df_int$Time, df_int$Condition == "Reading_Int")
Naming <- subset(df_int$Time, df_int$Condition == "Naming_Int")
# paired t-test: each participant contributes one value to each condition
t <- t.test(Reading, Naming, paired = TRUE)
pander(t)
| Test statistic | df  | P value         | Alternative hypothesis | Mean of the differences |
|----------------|-----|-----------------|------------------------|-------------------------|
| -16.77         | 103 | 3.337e-31 * * * | two.sided              | -3.837                  |
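If you need these numbers programmatically (e.g. for inline reporting), they are stored as named elements of the object returned by t.test():
t$statistic # the t-value
t$parameter # the degrees of freedom
t$p.value # the p-value
t$estimate # the mean of the differences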
We should be able to take random samples from the pooled data and still see the same two-condition pattern most of the time. Here are 10 random re-samples of 100 data points each from the total pool of data.
# set a seed so the random samples are reproducible
set.seed(42)
for (i in 1:10) {
print(paste0("Sample ", i))
d <- df_int[sample(nrow(df_int), 100), ]
print(ggplot(d, aes(Time, fill = Condition)) +
geom_density(alpha = 0.5) +
theme_classic())
}
# make individual plots for different conditions
df_long$Condition <- as.factor(df_long$Condition)
# add an index for each observation, used as x-axis values in the plots below
df_long$measurements <- seq_len(nrow(df_long))
p1 <- ggplot(df_long, aes(Time)) +
geom_density() +
theme_classic() +
labs(title = "All Data")
p2 <- ggplot(df_long, aes(Time, fill = Condition)) +
geom_density(alpha = 0.5) +
theme_classic() +
labs(title = "Data Modelled by Condition")
ggarrange(p1, p2, ncol = 1, nrow = 2)
# Find mean of all data points
GrandMean <- mean(df_long$Time)
# Find means for each condition
meanNI <- mean(df$Naming_Int)
meanRI <- mean(df$Reading_Int)
meanNNI <- mean(df$Naming_NoInt)
meanRNI <- mean(df$Reading_NoInt)
# Get x-axis values for each data point, grouped by condition
xNI <- subset(df_long$measurements, df_long$Condition == "Naming_Int")
xRI <- subset(df_long$measurements, df_long$Condition == "Reading_Int")
xNNI <- subset(df_long$measurements, df_long$Condition == "Naming_NoInt")
xRNI <- subset(df_long$measurements, df_long$Condition == "Reading_NoInt")
#yNI <- subset(df_long$Time, df_long$Condition == "Naming_Int")
#yRI <- subset(df_long$Time, df_long$Condition == "Reading_Int")
#yNNI <- subset(df_long$Time, df_long$Condition == "Naming_NoInt")
#yRNI <- subset(df_long$Time, df_long$Condition == "Reading_NoInt")
# Make a vector of endpoints for the lines from the data points to the model (condition) means.
# The four blocks must be in the same order as the condition blocks in df_long.
model_means <- c(rep(meanRNI, length(xRNI)), rep(meanNI, length(xNI)), rep(meanNNI, length(xNNI)), rep(meanRI, length(xRI)))
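# quick sanity check (added): model_means needs one value per row of df_long,
# or the residual segments below will pair up with the wrong points
stopifnot(length(model_means) == nrow(df_long))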
# Plot the total sums of squares
p_TSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
geom_point() +
geom_segment(aes(xend = measurements, yend = GrandMean), alpha = 0.2) +
geom_hline(yintercept = GrandMean, linetype="dashed") +
ylim(0,18) +
theme_classic() +
theme(legend.position="none") +
labs(title = "Total Sums of Squares")
# Plot the residual sums of squares
p_RSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
geom_point() +
geom_segment(aes(xend=measurements, yend=model_means), alpha = 0.2) +
geom_segment(aes(x=min(xNI),xend=max(xNI),y= meanNI,yend=meanNI),colour="black") +
geom_segment(aes(x=min(xRI),xend=max(xRI),y= meanRI,yend=meanRI),colour="black") +
geom_segment(aes(x=min(xNNI),xend=max(xNNI),y= meanNNI,yend=meanNNI),colour="black") +
geom_segment(aes(x=min(xRNI),xend=max(xRNI),y= meanRNI,yend=meanRNI),colour="black") +
ylim(0,18) +
theme_classic() +
theme(legend.position="none") +
labs(title = "Residual Sums of Squares")
# Plot the model sums of squares
p_MSS <- ggplot(df_long, aes(x = measurements, y = Time, color = Condition)) +
geom_segment(aes(x=min(xNI),xend=max(xNI),y= meanNI,yend=meanNI),colour="black") +
geom_segment(aes(x=min(xRI),xend=max(xRI),y= meanRI,yend=meanRI),colour="black") +
geom_segment(aes(x=min(xNNI),xend=max(xNNI),y= meanNNI,yend=meanNNI),colour="black") +
geom_segment(aes(x=min(xRNI),xend=max(xRNI),y= meanRNI,yend=meanRNI),colour="black") +
geom_hline(yintercept = GrandMean, linetype="dashed") +
geom_segment(aes(x=median(xNI), xend = median(xNI),y = meanNI, yend = GrandMean)) +
geom_segment(aes(x=median(xRI), xend = median(xRI),y = meanRI, yend = GrandMean)) +
geom_segment(aes(x=median(xNNI), xend = median(xNNI),y = meanNNI, yend = GrandMean)) +
geom_segment(aes(x=median(xRNI), xend = median(xRNI),y = meanRNI, yend = GrandMean)) +
ylim(0,18) +
theme_classic() +
theme(legend.position="none") +
labs(title = "Model Sums of Squares")
# Make a three-panel plot
ggarrange(p_TSS, p_RSS, p_MSS, ncol = 3)
The F-value, the critical statistic in the table below, is the “Mean Square” of the model divided by the “Mean Square” of the residuals: \(F=\frac{MS_{model}}{MS_{residual}}\)
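As a quick sanity check on this relationship, we can back out the residual mean square implied by any row of the ANOVA table below, since \(MS_{residual} = MS_{model} / F\):
# implied residual mean square, using the Task row of the table below
693.9 / 476.8 # ~1.455; each row of the table implies (roughly) the same value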
# calculate ANOVA
library(lmerTest)
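# NOTE (assumed reconstruction): the model below needs Task, Interference, and
# participant ID columns, which df_long does not yet contain as built above.
# One plausible way to derive them from the Condition labels (the column names
# and recoding here are assumptions, not the original preprocessing):
df_long <- separate(df_long, Condition, into = c("Task", "Interference"), sep = "_", remove = FALSE)
df_long$Interference <- ifelse(df_long$Interference == "Int", "Interference", "No_Interference")
df_long$ID <- as.factor(rep(seq_len(nrow(df)), times = 4)) # rows are stacked in condition blocks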
# Define our model: Time predicted by Task, Interference, and their interaction
# (Task * Interference expands to both main effects plus the interaction).
# We model participant ID as a random effect.
mod <- lmer(Time ~ Task * Interference + (1|ID), data = df_long)
ans <- anova(mod)
# Display the ANOVA results in a nice table
library(pander)
pander(ans)
|                   | Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F)    |
|-------------------|--------|---------|-------|-------|---------|-----------|
| Task              | 693.9  | 693.9   | 1     | 309   | 476.8   | 1.378e-64 |
| Interference      | 561.3  | 561.3   | 1     | 309   | 385.6   | 2.7e-56   |
| Task:Interference | 163.5  | 163.5   | 1     | 309   | 112.4   | 1.35e-22  |
# Print text that we can copy and paste into our report
library(psycho)
res <- analyze(ans)
print(res)
## - The effect of Task is significant (F(1, 309) = 476.77, p < .001).
## - The effect of Interference is significant (F(1, 309) = 385.63, p < .001).
## - The interaction between Task and Interference is significant (F(1, 309) = 112.36, p < .001).
# Display Estimates for Conditions
library(psycho)
library(pander)
mod_means <- get_means(mod)
pander(mod_means)
| Task    | Interference    | Mean  | SE     | df    | CI_lower | CI_higher |
|---------|-----------------|-------|--------|-------|----------|-----------|
| Naming  | Interference    | 9.049 | 0.1401 | 330.3 | 8.773    | 9.324     |
| Reading | Interference    | 5.212 | 0.1401 | 330.3 | 4.936    | 5.487     |
| Naming  | No_Interference | 5.472 | 0.1401 | 330.3 | 5.196    | 5.747     |
| Reading | No_Interference | 4.143 | 0.1401 | 330.3 | 3.867    | 4.418     |
# Calculate post-hoc contrasts (Tukey)
post_hoc <- get_contrasts(mod)
pander(post_hoc[,c(1,3:6)])
| Contrast                                          | SE     | df  | t      | p         |
|---------------------------------------------------|--------|-----|--------|-----------|
| Naming,Interference - Reading,Interference        | 0.1673 | 309 | 22.93  | 1.011e-12 |
| Naming,Interference - Naming,No_Interference      | 0.1673 | 309 | 21.38  | 1.011e-12 |
| Naming,Interference - Reading,No_Interference     | 0.1673 | 309 | 29.33  | 1.011e-12 |
| Reading,Interference - Naming,No_Interference     | 0.1673 | 309 | -1.554 | 0.4066    |
| Reading,Interference - Reading,No_Interference    | 0.1673 | 309 | 6.391  | 3.646e-09 |
| Naming,No_Interference - Reading,No_Interference  | 0.1673 | 309 | 7.945  | 1.27e-12  |
The only contrast that does not reach significance is Reading with Interference vs. Naming without Interference (p = 0.41); every other pair of conditions differs reliably.