Read data

load("nhanes_bmi.RData")
nhanes_bmi

Check covariates balance

n <- table(nhanes_bmi$School_meal)
z <- nhanes_bmi$School_meal
y <- nhanes_bmi$BMI
x <- as.matrix(nhanes_bmi[, -c(1 , 2)])
data_stat <- data.frame(t(apply(x, 2, function(x) c(mean(x[z==0]), sd(x[z==0]), mean(x[z==1]), sd(x[z==1])))))
colnames(data_stat) <- c("Mean control", "S.D. control", "Mean treated", "S.D. treated")
data_stat$t_stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
signif(data_stat, 2)

Visualize coviariates balance

library("ggplot2")
mm <- data_stat[, 3] - data_stat[, 1]
sd_mm <- sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
temp <- data.frame(x = rownames(data_stat), y = mm, low = mm - 1.96 * sd_mm, 
                   high = mm + 1.96 * sd_mm)
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) + geom_errorbar(aes(ymin = low, ymax = high))+ xlab("") + ylab("") + theme_classic() + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

Estimate propensity score

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pscore <- glm (z ~ x , family = binomial)$fitted.values

temp <- data.frame(group = factor(z), pscore = pscore)
p <- ggplot(temp, aes(x=pscore, fill=group)) + geom_histogram(alpha=0.6, position = 'identity')
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The range of the estimated propensity score is \([0.134, 0.963]\). There are not very extreme propensity scores so trimming is optional.

Check covariate balancing after stratification K = 5

NeymanSRE <- function(W, Y, strata.labels) {
  n <- length(W)
  groups <- unique(strata.labels)
  ests <- sapply(groups, function(gg) {
   wt <- sum(strata.labels == gg)/n
   est <- mean(Y[strata.labels == gg & W == 1]) - mean(Y[strata.labels == gg & W == 0])
   est.var <- var(Y[strata.labels == gg & W == 1])/sum(strata.labels == gg & W == 1) + 
     var(Y[strata.labels == gg & W == 0])/sum(strata.labels == gg & W == 0)
   return(c(wt, est, est.var))
 })
# print(ests)
 neyman.est <- sum(ests[1, ] * ests[2, ])
 neyman.var <- sum(ests[1, ]^2 * ests[3, ])
 return(c(est = neyman.est, se = sqrt(neyman.var)))
}

nn <- 5
q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
balance_check <- apply(x, 2, function(v) NeymanSRE(z, v, ps.strata)) 


temp <- data.frame(x = colnames(x), y = balance_check[1, ], 
                   low = balance_check[1, ] - 1.96 * balance_check[2, ], 
                   high = balance_check[1, ] + 1.96 * balance_check[2, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) + geom_errorbar(aes(ymin = low, ymax = high))+ xlab("") + ylab("") + theme_classic() + theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

Stratification based on propensity score

n.strata <- c(5 , 10 , 20 , 50)
strat.res <- sapply(n.strata , function(nn){
  q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
  ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
  NeymanSRE(z, y, ps.strata)})

colnames(strat.res) <- n.strata
round(strat.res, 3)
##          5     10     20     50
## est -0.116 -0.178 -0.200 -0.265
## se   0.283  0.282  0.279  0.272