1. Read the data

Read the data and remove the two individuals that have dropped out. Focus on the outcome variable SPPB score (short physical performance battery).

data <- read.table("HOMEFOOD.tab", sep = "\t", header = T)
data <- data[!is.na(data$BMI_1), ]

data <- data[, c("group", "sex", "age", "education_2_cat", "smoker", "alcohol", "nr_of_medicines", "BMI_0", "sppb_0", "ICD10", "MMSE_0", "ISNST_0", "sppb_1")]

data
balance_results <- sapply(2:12, function(i) {
  mean_t <- mean(data[data$group == 1, i])
  mean_c <- mean(data[data$group == 0, i])
  sd_t <- sd(data[data$group == 1, i])
  sd_c <- sd(data[data$group == 0, i])
  t_stat <- (mean_t - mean_c) / sqrt(sd_t^2/sum(data$group == 1) + sd_c^2/sum(data$group == 0))
  p_val <- 2 * (1-pnorm(abs(t_stat)))
  return(c(mean_t, sd_t, mean_c, sd_c, t_stat, p_val))
})
balance_results <- t(balance_results)
colnames(balance_results) <- c("treat_mean", "treat_sd", "control_mean", "control_sd", "t_stat", "p_val")
rownames(balance_results) <- colnames(data)[2:12]
signif(balance_results, 2)
##                 treat_mean treat_sd control_mean control_sd t_stat p_val
## sex                   1.30     0.45         1.50       0.50 -2.100 0.040
## age                  83.00     6.70        82.00       6.00  1.200 0.230
## education_2_cat       0.69     0.47         0.67       0.47  0.210 0.830
## smoker                2.00     0.19         1.90       0.30  1.200 0.240
## alcohol               1.60     0.49         1.50       0.50  0.790 0.430
## nr_of_medicines      12.00     4.60        12.00       4.20 -0.640 0.520
## BMI_0                28.00     6.50        27.00       5.30  1.300 0.200
## sppb_0                2.50     1.80         2.40       2.00  0.310 0.760
## ICD10                10.00     4.90        10.00       3.80 -0.089 0.930
## MMSE_0               26.00     2.70        26.00       2.90  0.420 0.670
## ISNST_0               5.10     1.70         4.50       1.30  1.900 0.052

2. Neyman’s approach before and after post-stratification

tau_hat <- mean(data$sppb_1[data$group == 1]) - mean(data$sppb_1[data$group == 0])

st <- sd(data$sppb_1[data$group == 1])
sc <- sd(data$sppb_1[data$group == 0])
varhat_tauhat <- st^2/sum(data$group == 1) + sc^2/sum(data$group == 0)

pval <- 2 * pnorm(abs(tau_hat)/sqrt(varhat_tauhat), lower.tail = F)
result <- c(tau_hat, sqrt(varhat_tauhat), tau_hat- 1.96 * sqrt(varhat_tauhat), tau_hat + 1.96 * sqrt(varhat_tauhat), pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
##      est       sd CI lower CI upper  p-value 
##    1.000    0.420    0.220    1.900    0.013
## CI from Neyman

## For the female strata
tau_hat_F <- mean(data$sppb_1[data$group == 1 & data$sex == 1]) - mean(data$sppb_1[data$group == 0 & data$sex == 1])

st_F <- sd(data$sppb_1[data$group == 1 & data$sex == 1])
sc_F <- sd(data$sppb_1[data$group == 0 & data$sex == 1])
varhat_tauhat_F <- st_F^2/sum(data$group == 1 & data$sex == 1) + sc_F^2/sum(data$group == 0 & data$sex == 1)

## For the male strata
tau_hat_M <- mean(data$sppb_1[data$group == 1 & data$sex == 2]) - mean(data$sppb_1[data$group == 0 & data$sex == 2])

st_M <- sd(data$sppb_1[data$group == 1 & data$sex == 2])
sc_M <- sd(data$sppb_1[data$group == 0 & data$sex == 2])
varhat_tauhat_M <- st_M^2/sum(data$group == 1 & data$sex == 2) + sc_M^2/sum(data$group == 0 & data$sex == 2)

nF <- sum(data$sex == 1)
nM <- sum(data$sex == 2)
tau_hat <- nF/(nF+nM) * tau_hat_F + nM/(nF+nM) * tau_hat_M
sd_tau_hat <- sqrt((nF/(nF+nM))^2 * varhat_tauhat_F + (nM/(nF+nM))^2 * varhat_tauhat_M)
  
## 95% CI
result <- cbind(c(tau_hat_F, sqrt(varhat_tauhat_F)), c(tau_hat_M, sqrt(varhat_tauhat_M)), c(tau_hat, sd_tau_hat))
colnames(result) <- c("Female", "Male", "Combined")
rownames(result) <- c("est", "sd")
signif(result, 2)
##     Female Male Combined
## est   0.65 1.90     1.10
## sd    0.53 0.72     0.43
print(paste0("95% CI for the ATE: [", signif(tau_hat- 1.96 * sd_tau_hat, 2), ", ", 
             signif(tau_hat + 1.96 * sd_tau_hat, 3), "]"))
## [1] "95% CI for the ATE: [0.26, 1.93]"
data$sex_std <- data$sex - mean(data$sex)
lm.result <- lm(sppb_1 ~ group + sex_std + group:sex_std, data = data)
library(sandwich)
summary(lm.result)
## 
## Call:
## lm(formula = sppb_1 ~ group + sex_std + group:sex_std, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5789 -1.6250  0.4211  1.4211  5.0714 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.8177     0.3017   9.340 2.77e-15 ***
## group           1.0978     0.4276   2.567   0.0117 *  
## sex_std        -0.3036     0.5942  -0.511   0.6105    
## group:sex_std   1.2246     0.8939   1.370   0.1737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.136 on 100 degrees of freedom
## Multiple R-squared:  0.07671,    Adjusted R-squared:  0.04902 
## F-statistic:  2.77 on 3 and 100 DF,  p-value: 0.04558
sd <- sqrt(vcovHC(lm.result, type = "HC2")[2, 2])
est <- summary(lm.result)$coefficients["group", "Estimate"]
pval <- 2 * pnorm(abs(est)/sd, lower.tail = F)
result <- c(est, sd, est - 1.96*sd, est + 1.96*sd, pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
##      est       sd CI lower CI upper  p-value 
##   1.1000   0.4300   0.2600   1.9000   0.0099

We get the same CI as using Neyman’s approach with post-stratification

data$sppb_0_std <- data$sppb_0 - mean(data$sppb_0)
lm.result <- lm(sppb_1 ~ group + sex_std + sppb_0_std + group:(sex_std+sppb_0_std), data = data)
library(sandwich)
summary(lm.result)
## 
## Call:
## lm(formula = sppb_1 ~ group + sex_std + sppb_0_std + group:(sex_std + 
##     sppb_0_std), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5943 -1.2032  0.0681  1.4106  3.8035 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.82523    0.26399  10.702  < 2e-16 ***
## group             1.05983    0.37437   2.831  0.00563 ** 
## sex_std           0.01903    0.52398   0.036  0.97110    
## sppb_0_std        0.66910    0.13444   4.977 2.76e-06 ***
## group:sex_std     0.82426    0.78539   1.050  0.29653    
## group:sppb_0_std -0.27133    0.19574  -1.386  0.16884    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.869 on 98 degrees of freedom
## Multiple R-squared:  0.3071, Adjusted R-squared:  0.2718 
## F-statistic: 8.687 on 5 and 98 DF,  p-value: 7.596e-07
sd <- sqrt(vcovHC(lm.result, type = "HC2")[2, 2])
est <- summary(lm.result)$coefficients["group", "Estimate"]
pval <- 2 * pnorm(abs(est)/sd, lower.tail = F)
result <- c(est, sd, est - 1.96*sd, est + 1.96*sd, pval)
names(result) <- c("est", "sd", "CI lower", "CI upper", "p-value")
signif(result, 2)
##      est       sd CI lower CI upper  p-value 
##   1.1000   0.3700   0.3300   1.8000   0.0044
data$Y0 <- data$Y1 <- data$sppb_1


getPvalueMC <- function(data, tau, M = 10000) {
  tau_obs <- mean(data$sppb_1[data$group==1]) - mean(data$sppb_1[data$group==0]) - tau
  
  data$Y0[data$group == 1] <-  data$sppb_1[data$group == 1] - tau
  data$Y1[data$group == 0] <- data$sppb_1[data$group == 0] + tau

  ## randomly sample the treatment assignment vector following the complete randomization mechanism
  
  set.seed(0)
  tau_rand <- sapply(1:M, function(i) {
    w <- sample(data$group)
    return(mean(data$Y1[w==1]) - mean(data$Y0[w==0]) - tau)
  })
  hist(tau_rand, breaks = 10, 
       main="Distribution under sharp Null of causal effect = 0", xlab="Test statistics")
  abline(v = tau_obs, col = "red")
  p = mean(abs(tau_rand) >= abs(tau_obs))
  return(p)
}

paste("Fisher's p-value:", getPvalueMC(data, 0, 10000))

## [1] "Fisher's p-value: 0.0182"
strata <- factor(data$sex)


getPvalueMCStrata <- function(data, tau, strata, M = 10000) {
  wts <- prop.table(table(strata))
  levels <- names(wts)
  
  tau_obs_strata <- sapply(levels, 
                           function(k) {mean(data$sppb_1[data$group==1 & strata == k]) -
                               mean(data$sppb_1[data$group==0 & strata == k]) - tau})
  tau_obs <- sum(wts * tau_obs_strata)
  
  
  data$Y0[data$group == 1] <-  data$sppb_1[data$group == 1] - tau
  data$Y1[data$group == 0] <- data$sppb_1[data$group == 0] + tau

  ## randomly sample the treatment assignment vector following the complete randomization mechanism
  
  set.seed(0)
  tau_rand <- sapply(1:M, function(i) {
    val_strata <- sapply(levels, function(k) {
      wk <- sample(data[strata == k, ]$group)
      return(mean(data[strata == k, ]$Y1[wk == 1]) - mean(data[strata == k, ]$Y0[wk == 0]) - tau)
      })
    val <- sum(wts * val_strata)
    return(val)
  })
  hist(tau_rand, breaks = 10, 
       main="Distribution under sharp Null of causal effect = 0", xlab="Test statistics")
  abline(v = tau_obs, col = "red")
  p = mean(abs(tau_rand) >= abs(tau_obs))
  return(p)
}

paste("Fisher's p-value:", getPvalueMCStrata(data,0, strata, 10000))

## [1] "Fisher's p-value: 0.0109"