Background:
A school district aims to assess the effectiveness of a new teaching method designed to improve students’ mathematics performance. They conduct a completely randomized experiment involving 200 high school students, all in the same grade level. Each student is randomly assigned to one of two groups:
Treatment Group: Students who receive instruction using the new teaching method.
Control Group: Students who continue with the standard teaching method.
Data Collected:
Treatment Indicator: A binary variable where 1 indicates the student received the new teaching method, and 0 indicates the standard method.
Prior Achievement Level: A categorical variable indicating the prior math achievement level (Basic, Proficient, Advanced) of each student.
Post-Test Scores (outcome): End-of-term math test scores for each student.
We use a synthetic dataset generated in R:
set.seed(123)
# Number of students
n <- 200
# Generate Prior Achievement Levels
Achievement_levels <- c("Basic", "Proficient", "Advanced")
Achievement <- sample(Achievement_levels, n, replace = TRUE, prob = c(0.3, 0.5, 0.2))
# Convert Achievement to factor with specified order
Achievement <- factor(Achievement, levels = c("Basic", "Proficient", "Advanced"))
# Randomly assign students to Treatment (1) or Control (0) groups
Treatment <- rbinom(n, 1, 0.5)
# Generate random error term (assuming normal distribution)
epsilon <- rnorm(n, mean = 0, sd = 5)
# Calculate potential post-test scores without the treatment
PostTest_0 <- 60 + ifelse(Achievement == "Proficient", 10, 0) +
ifelse(Achievement == "Advanced", 20, 0) +
epsilon
## individual treatment effect, allowing heterogeneity across individuals and interaction with the achievement levels
Treatment_effect <- 5 + ifelse(Achievement == "Proficient", 10, 0) + ifelse(Achievement == "Advanced", -2, 0) + rnorm(n, mean = 0, sd = 5)
## observed outcome: Y = Y(0) + \tau_i W_i
PostTest <- PostTest_0 + Treatment_effect * Treatment
# Create a data frame for the observed data
data <- data.frame(
Y = PostTest,
W = Treatment,
X = Achievement
)
print(paste("True average treatment effect:", round(mean(Treatment_effect), digits=2)))
## [1] "True average treatment effect: 9.94"
## Visualize the data
data$Treatment_jitter <- jitter(data$W, amount = 0.1)
# Load ggplot2 for plotting
library(ggplot2)
# Create the scatter plot
ggplot(data, aes(x = Treatment_jitter, y = Y, color = X)) +
geom_point(alpha = 0.7, size = 2) +
labs(
title = "Treatment vs. Control Colored by Prior Achievement",
x = "Treatment (0 = Control, 1 = Treatment)",
y = "Post-Test Score",
color = "Achievement Level"
) +
theme_minimal() +
scale_x_continuous(breaks = c(0, 1), labels = c("Control", "Treatment")) +
theme(
legend.position = "right",
plot.title = element_text(hjust = 0.5)
)
lm.result1 <- lm(Y ~ W, data = data)
## with the homoscedastic error assumption
sd <- summary(lm.result1)$coefficients["W", "Std. Error"]
est <- summary(lm.result1)$coefficients["W", "Estimate"]
## without the homoscedastic error assumption
library(sandwich)
## with the HC2 adjustment
sd2 <- sqrt(vcovHC(lm.result1, type = "HC2")[2, 2])
result1 <- c(est, sd, sd2)
names(result1) <- c("estimate", "Homoscedastic error", "Sandwich with HC2")
print(result1)
## estimate Homoscedastic error Sandwich with HC2
## 8.325908 1.339229 1.373047
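As a sanity check (optional, using only base R), the coefficient on W in this regression is just the difference in group means, and the HC2 standard error coincides with Neyman's variance estimator for a completely randomized experiment:
## Difference in means and Neyman standard error; these should match
## the OLS estimate and the HC2 standard error reported above
n1 <- sum(data$W == 1)
n0 <- sum(data$W == 0)
diff_means <- mean(data$Y[data$W == 1]) - mean(data$Y[data$W == 0])
neyman_se <- sqrt(var(data$Y[data$W == 1]) / n1 + var(data$Y[data$W == 0]) / n0)
c(diff_means = diff_means, neyman_se = neyman_se)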
We now incorporate the pre-treatment achievement level into the linear regression while still targeting the average treatment effect.
## If we assume the conditional average treatment effect of the new teaching method to be the same across the prior achievement levels
lm.result2 <- lm(Y ~ W + X, data = data)
summary(lm.result2)
##
## Call:
## lm(formula = Y ~ W + X, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3390 -4.6270 0.1561 4.2207 17.1226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.7926 0.9510 61.82 <2e-16 ***
## W 9.6559 0.9132 10.57 <2e-16 ***
## XProficient 13.2170 1.0397 12.71 <2e-16 ***
## XAdvanced 18.4742 1.3454 13.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.396 on 196 degrees of freedom
## Multiple R-squared: 0.6198, Adjusted R-squared: 0.614
## F-statistic: 106.5 on 3 and 196 DF, p-value: < 2.2e-16
## with the homoscedastic error assumption
sd <- summary(lm.result2)$coefficients["W", "Std. Error"]
est <- summary(lm.result2)$coefficients["W", "Estimate"]
## without the homoscedastic error assumption
## with the HC2 adjustment
sd2 <- sqrt(vcovHC(lm.result2, type = "HC2")[2, 2])
result2 <- c(est, sd, sd2)
names(result2) <- c("estimate", "Homoscedastic error", "Sandwich with HC2")
print(result2)
## estimate Homoscedastic error Sandwich with HC2
## 9.6558557 0.9132314 0.9403229
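As an aside, if the estimatr package is installed, lm_robust() reports heteroskedasticity-robust standard errors (HC2 by default for unclustered data), so the same covariate-adjusted estimate can be obtained in a single call. This is a convenience sketch, not part of the original analysis:
## Optional one-call alternative using the estimatr package
# install.packages("estimatr")
library(estimatr)
lm_robust(Y ~ W + X, data = data)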
We see that adding pre-treatment covariates can indeed increase our estimation accuracy: the HC2 standard error drops from about 1.37 to about 0.94. Next, we allow the treatment effect to differ across achievement levels by centering the achievement indicators and interacting them with the treatment indicator; with centered covariates, the coefficient on W still estimates the average treatment effect.
data$X1 <- data$X == "Proficient"
data$X2 <- data$X == "Advanced"
data$X1 <- data$X1 - mean(data$X1)
data$X2 <- data$X2 - mean(data$X2)
lm.result3 <- lm(Y ~ W + X1 + X2 + W:X1 + W:X2, data = data)
summary(lm.result3)
##
## Call:
## lm(formula = Y ~ W + X1 + X2 + W:X1 + W:X2, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.3155 -3.8493 -0.1029 4.1444 14.7565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.0132 0.5793 119.131 < 2e-16 ***
## W 9.5801 0.8559 11.193 < 2e-16 ***
## X1 8.7434 1.3747 6.360 1.42e-09 ***
## X2 17.8872 1.6531 10.821 < 2e-16 ***
## W:X1 9.0929 1.9481 4.668 5.68e-06 ***
## W:X2 -0.5227 2.5789 -0.203 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.99 on 194 degrees of freedom
## Multiple R-squared: 0.6698, Adjusted R-squared: 0.6613
## F-statistic: 78.72 on 5 and 194 DF, p-value: < 2.2e-16
## with the homoscedastic error assumption
sd <- summary(lm.result3)$coefficients["W", "Std. Error"]
est <- summary(lm.result3)$coefficients["W", "Estimate"]
## without the homoscedastic error assumption
## with the HC2 adjustment
sd2 <- sqrt(vcovHC(lm.result3, type = "HC2")[2, 2])
result3 <- c(est, sd, sd2)
names(result3) <- c("estimate", "Homoscedastic error", "Sandwich with HC2")
print(result3)
## estimate Homoscedastic error Sandwich with HC2
## 9.5800660 0.8558842 0.8781174
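Because the achievement indicators form a saturated set of covariates, the coefficient on W in this interacted (Lin-type) regression should equal the post-stratified estimator, i.e. the stratum-size-weighted average of within-level differences in means. The following sketch checks this numerically:
## Post-stratified estimator: weight within-level differences in means
## by the share of students at each achievement level
tau_by_level <- sapply(levels(data$X), function(s) {
  d <- data[data$X == s, ]
  mean(d$Y[d$W == 1]) - mean(d$Y[d$W == 0])
})
weights <- as.numeric(table(data$X)) / nrow(data)
sum(weights * tau_by_level) # should match the W coefficient in lm.result3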
result <- data.frame(rbind(result1, result2, result3))
result$label <- factor(c("no covariate", "linear adjustment", "linear adjustment with interactions"),
levels = c("linear adjustment with interactions", "linear adjustment", "no covariate"))
# Confidence interval level
alpha <- 0.05
z <- qnorm(1 - alpha / 2)
# Calculate confidence intervals
result$CI_Lower <- result[, 1] - z * result[, 3]
result$CI_Upper <- result[, 1] + z * result[, 3]
true_value <- mean(Treatment_effect) # the true average treatment effect in the simulated sample
# Create the plot
ggplot(result, aes(y = label, x = estimate)) +
geom_point(size = 3) + # Add points for estimates
geom_errorbarh(aes(xmin = CI_Lower, xmax = CI_Upper), height = 0.2, linewidth = 1) + # Horizontal error bars for the CIs
geom_vline(xintercept = true_value, color = "red", linetype = "dashed", linewidth = 1) + # Dashed red line at the true ATE
labs(title = "Confidence Intervals of ATE", x = "Estimate", y = "") +
theme_minimal() + # Use a clean theme
theme(axis.text.y = element_text(size = 12), # Customize text size for labels
plot.title = element_text(hjust = 0.5, size = 14)) # Center the title
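For readers who prefer numbers to the plot, the point estimates and 95% confidence intervals can also be printed directly, using the objects already created above:
## Print the point estimates and 95% confidence intervals
print(result[, c("label", "estimate", "CI_Lower", "CI_Upper")], digits = 3)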