Read the data and remove the two individuals that have dropped out. Focus on the outcome variable SPPB score (short physical performance battery).
data <- read.table("HOMEFOOD.tab", sep = "\t", header = T)
data <- data[!is.na(data$BMI_1), ]
data <- data[, c("group", "sex", "age", "education_2_cat", "smoker", "alcohol", "nr_of_medicines", "BMI_0", "sppb_0", "ICD10", "MMSE_0", "ISNST_0", "sppb_1")]
data
balance_results <- sapply(2:12, function(i) {
mean_t <- mean(data[data$group == 1, i])
mean_c <- mean(data[data$group == 0, i])
sd_t <- sd(data[data$group == 1, i])
sd_c <- sd(data[data$group == 0, i])
t_stat <- (mean_t - mean_c) / sqrt(sd_t^2/sum(data$group == 1) + sd_c^2/sum(data$group == 0))
p_val <- 2 * (1-pnorm(abs(t_stat)))
return(c(mean_t, sd_t, mean_c, sd_c, t_stat, p_val))
})
balance_results <- t(balance_results)
colnames(balance_results) <- c("treat_mean", "treat_sd", "control_mean", "control_sd", "t_stat", "p_val")
rownames(balance_results) <- colnames(data)[2:12]
signif(balance_results, 2)
## treat_mean treat_sd control_mean control_sd t_stat p_val
## sex 1.30 0.45 1.50 0.50 -2.100 0.040
## age 83.00 6.70 82.00 6.00 1.200 0.230
## education_2_cat 0.69 0.47 0.67 0.47 0.210 0.830
## smoker 2.00 0.19 1.90 0.30 1.200 0.240
## alcohol 1.60 0.49 1.50 0.50 0.790 0.430
## nr_of_medicines 12.00 4.60 12.00 4.20 -0.640 0.520
## BMI_0 28.00 6.50 27.00 5.30 1.300 0.200
## sppb_0 2.50 1.80 2.40 2.00 0.310 0.760
## ICD10 10.00 4.90 10.00 3.80 -0.089 0.930
## MMSE_0 26.00 2.70 26.00 2.90 0.420 0.670
## ISNST_0 5.10 1.70 4.50 1.30 1.900 0.052
tau_hat <- mean(data$sppb_1[data$group == 1]) - mean(data$sppb_1[data$group == 0])
st <- sd(data$sppb_1[data$group == 1])
sc <- sd(data$sppb_1[data$group == 0])
varhat_tauhat <- st^2/sum(data$group == 1) + sc^2/sum(data$group == 0)
pval <- 2 * pnorm(abs(tau_hat)/sqrt(varhat_tauhat), lower.tail = F)
result <- c(tau_hat, sqrt(varhat_tauhat), tau_hat- 1.96 * sqrt(varhat_tauhat), tau_hat + 1.96 * sqrt(varhat_tauhat), pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
## est sd CI lower CI upper p-value
## 1.000 0.420 0.220 1.900 0.013
## CI from Neyman
## For the female strata
tau_hat_F <- mean(data$sppb_1[data$group == 1 & data$sex == 1]) - mean(data$sppb_1[data$group == 0 & data$sex == 1])
st_F <- sd(data$sppb_1[data$group == 1 & data$sex == 1])
sc_F <- sd(data$sppb_1[data$group == 0 & data$sex == 1])
varhat_tauhat_F <- st_F^2/sum(data$group == 1 & data$sex == 1) + sc_F^2/sum(data$group == 0 & data$sex == 1)
## For the male strata
tau_hat_M <- mean(data$sppb_1[data$group == 1 & data$sex == 2]) - mean(data$sppb_1[data$group == 0 & data$sex == 2])
st_M <- sd(data$sppb_1[data$group == 1 & data$sex == 2])
sc_M <- sd(data$sppb_1[data$group == 0 & data$sex == 2])
varhat_tauhat_M <- st_M^2/sum(data$group == 1 & data$sex == 2) + sc_M^2/sum(data$group == 0 & data$sex == 2)
nF <- sum(data$sex == 1)
nM <- sum(data$sex == 2)
tau_hat <- nF/(nF+nM) * tau_hat_F + nM/(nF+nM) * tau_hat_M
sd_tau_hat <- sqrt((nF/(nF+nM))^2 * varhat_tauhat_F + (nM/(nF+nM))^2 * varhat_tauhat_M)
## 95% CI
result <- cbind(c(tau_hat_F, sqrt(varhat_tauhat_F)), c(tau_hat_M, sqrt(varhat_tauhat_M)), c(tau_hat, sd_tau_hat))
colnames(result) <- c("Female", "Male", "Combined")
rownames(result) <- c("est", "sd")
signif(result, 2)
## Female Male Combined
## est 0.65 1.90 1.10
## sd 0.53 0.72 0.43
print(paste0("95% CI for the ATE: [", signif(tau_hat- 1.96 * sd_tau_hat, 2), ", ",
signif(tau_hat + 1.96 * sd_tau_hat, 3), "]"))
## [1] "95% CI for the ATE: [0.26, 1.93]"
data$sex_std <- data$sex - mean(data$sex)
lm.result <- lm(sppb_1 ~ group + sex_std + group:sex_std, data = data)
library(sandwich)
summary(lm.result)
##
## Call:
## lm(formula = sppb_1 ~ group + sex_std + group:sex_std, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5789 -1.6250 0.4211 1.4211 5.0714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8177 0.3017 9.340 2.77e-15 ***
## group 1.0978 0.4276 2.567 0.0117 *
## sex_std -0.3036 0.5942 -0.511 0.6105
## group:sex_std 1.2246 0.8939 1.370 0.1737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.136 on 100 degrees of freedom
## Multiple R-squared: 0.07671, Adjusted R-squared: 0.04902
## F-statistic: 2.77 on 3 and 100 DF, p-value: 0.04558
sd <- sqrt(vcovHC(lm.result, type = "HC2")[2, 2])
est <- summary(lm.result)$coefficients["group", "Estimate"]
pval <- 2 * pnorm(abs(est)/sd, lower.tail = F)
result <- c(est, sd, est - 1.96*sd, est + 1.96*sd, pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
## est sd CI lower CI upper p-value
## 1.1000 0.4300 0.2600 1.9000 0.0099
We get the same CI as using Neyman’s approach with post-stratification
data$sppb_0_std <- data$sppb_0 - mean(data$sppb_0)
lm.result <- lm(sppb_1 ~ group + sex_std + sppb_0_std + group:(sex_std+sppb_0_std), data = data)
library(sandwich)
summary(lm.result)
##
## Call:
## lm(formula = sppb_1 ~ group + sex_std + sppb_0_std + group:(sex_std +
## sppb_0_std), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5943 -1.2032 0.0681 1.4106 3.8035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.82523 0.26399 10.702 < 2e-16 ***
## group 1.05983 0.37437 2.831 0.00563 **
## sex_std 0.01903 0.52398 0.036 0.97110
## sppb_0_std 0.66910 0.13444 4.977 2.76e-06 ***
## group:sex_std 0.82426 0.78539 1.050 0.29653
## group:sppb_0_std -0.27133 0.19574 -1.386 0.16884
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.869 on 98 degrees of freedom
## Multiple R-squared: 0.3071, Adjusted R-squared: 0.2718
## F-statistic: 8.687 on 5 and 98 DF, p-value: 7.596e-07
sd <- sqrt(vcovHC(lm.result, type = "HC2")[2, 2])
est <- summary(lm.result)$coefficients["group", "Estimate"]
pval <- 2 * pnorm(abs(est)/sd, lower.tail = F)
result <- c(est, sd, est - 1.96*sd, est + 1.96*sd, pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
## est sd CI lower CI upper p-value
## 1.1000 0.3700 0.3300 1.8000 0.0044
data$Y0 <- data$Y1 <- data$sppb_1
getPvalueMC <- function(data, tau, M = 10000) {
tau_obs <- mean(data$sppb_1[data$group==1]) - mean(data$sppb_1[data$group==0]) - tau
data$Y0[data$group == 1] <- data$sppb_1[data$group == 1] - tau
data$Y1[data$group == 0] <- data$sppb_1[data$group == 0] + tau
## randomly sample the treatment assignment vector following the complete randomization mechanism
set.seed(0)
tau_rand <- sapply(1:M, function(i) {
w <- sample(data$group)
return(mean(data$Y1[w==1]) - mean(data$Y0[w==0]) - tau)
})
hist(tau_rand, breaks = 10,
main="Distribution under sharp Null of causal effect = 0", xlab="Test statistics")
abline(v = tau_obs, col = "red")
p = mean(abs(tau_rand) >= abs(tau_obs))
return(p)
}
paste("Fisher's p-value:", getPvalueMC(data, 0, 10000))
## [1] "Fisher's p-value: 0.0182"
strata <- factor(data$sex)
getPvalueMCStrata <- function(data, tau, strata, M = 10000) {
wts <- prop.table(table(strata))
levels <- names(wts)
tau_obs_strata <- sapply(levels,
function(k) {mean(data$sppb_1[data$group==1 & strata == k]) -
mean(data$sppb_1[data$group==0 & strata == k]) - tau})
tau_obs <- sum(wts * tau_obs_strata)
data$Y0[data$group == 1] <- data$sppb_1[data$group == 1] - tau
data$Y1[data$group == 0] <- data$sppb_1[data$group == 0] + tau
## randomly sample the treatment assignment vector following the complete randomization mechanism
set.seed(0)
tau_rand <- sapply(1:M, function(i) {
val_strata <- sapply(levels, function(k) {
wk <- sample(data[strata == k, ]$group)
return(mean(data[strata == k, ]$Y1[wk == 1]) - mean(data[strata == k, ]$Y0[wk == 0]) - tau)
})
val <- sum(wts * val_strata)
return(val)
})
hist(tau_rand, breaks = 10,
main="Distribution under sharp Null of causal effect = 0", xlab="Test statistics")
abline(v = tau_obs, col = "red")
p = mean(abs(tau_rand) >= abs(tau_obs))
return(p)
}
paste("Fisher's p-value:", getPvalueMCStrata(data,0, strata, 10000))
## [1] "Fisher's p-value: 0.0109"