We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.

Initial assessment

We obtain the data from a software called MatchIt. This is a subset of the data that you will deal with in HW4.

library("MatchIt")
There were 16 warnings (use warnings() to see them)
data("lalonde")
lalonde
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
x <- model.matrix(model)[, -1]
n <- table(lalonde$treat)
z <- lalonde$treat
data_stat <- data.frame(t(apply(x, 2, function(x) c(mean(x[z==0]), sd(x[z==0]), mean(x[z==1]), sd(x[z==1])))))
colnames(data_stat) <- c("Mean control", "S.D. control", "Mean treated", "S.D. treated")
data_stat$t_stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
signif(data_stat, 2)

We use t statistics here as the mean differences for different variables do not have comparable scales

library("ggplot2")
t.stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
temp <- data.frame(x = colnames(x), y = t.stat)
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

T statistics show that the mean differences of many variables are significantly non-zero.

Stratification based on the estimated propensity score

model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
summary(model)

Call:
glm(formula = treat ~ ., family = "binomial", data = lalonde[, 
    -9])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7645  -0.4736  -0.2862   0.7508   2.7169  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.663e+00  9.709e-01  -1.713  0.08668 .  
age          1.578e-02  1.358e-02   1.162  0.24521    
educ         1.613e-01  6.513e-02   2.477  0.01325 *  
racehispan  -2.082e+00  3.672e-01  -5.669 1.44e-08 ***
racewhite   -3.065e+00  2.865e-01 -10.699  < 2e-16 ***
married     -8.321e-01  2.903e-01  -2.866  0.00415 ** 
nodegree     7.073e-01  3.377e-01   2.095  0.03620 *  
re74        -7.178e-05  2.875e-05  -2.497  0.01253 *  
re75         5.345e-05  4.635e-05   1.153  0.24884    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 751.49  on 613  degrees of freedom
Residual deviance: 487.84  on 605  degrees of freedom
AIC: 505.84

Number of Fisher Scoring iterations: 5
library(ggplot2)
pscore <- model$fitted.values
temp.data <- data.frame(eps = pscore, treated = as.factor(lalonde$treat))
ggplot(temp.data, 
       aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Estimated propensity score") 

Overlap is extremely poor. Many control units have low propensity score.

rm.idx <- which(pscore < 0.1 | pscore > 0.9)
idx.treated <- which((lalonde$treat == 1) & (pscore > max(pscore[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (pscore < min(pscore[lalonde$treat == 1])))
rm.idx <- union(rm.idx, c(idx.treated, idx.control))

lalonde <- lalonde[-rm.idx, ]
pscore <- pscore[-rm.idx]
lps <- predict(model)[-rm.idx]
x <- x[-rm.idx, ]

lalonde

Almost half of the units get trimmed out

Stratify the units into K = 5 equal strata and check covariates balancing

NeymanSRE <- function(W, Y, strata.labels) {
  n <- length(W)
  groups <- unique(strata.labels)
  ests <- sapply(groups, function(gg) {
   wt <- sum(strata.labels == gg)/n
   est <- mean(Y[strata.labels == gg & W == 1]) - mean(Y[strata.labels == gg & W == 0])
   est.var <- var(Y[strata.labels == gg & W == 1])/sum(strata.labels == gg & W == 1) + 
     var(Y[strata.labels == gg & W == 0])/sum(strata.labels == gg & W == 0)
   return(c(wt, est, est.var))
 })
# print(ests)
 neyman.est <- sum(ests[1, ] * ests[2, ])
 neyman.var <- sum(ests[1, ]^2 * ests[3, ])
 return(c(est = neyman.est, se = sqrt(neyman.var), 
          tstat = neyman.est/sqrt(neyman.var)))
}

nn <- 5
q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, ps.strata)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

Covariates get much better balanced.

## First, create a data frame to store the current grouping information
temp <- data.frame(e = lps, treat = lalonde$treat, b = 1)

t.max <-  1.96
# check whether t. stat is above t.max for first iteration
t.stat <- t.test(x = temp$e[temp$treat == 1], y =  temp$e[temp$treat == 0], var.equal=T)$statistic
condition = t.stat > t.max
# minimum n.t or n.c in each block
size <- 3
size.new <- 20

# continue until all t statistics are below t.max
set.seed(0)
while(condition)
{
  # calculate size in each group
  # we want to not split a block if it is too small
  b <- max(temp$b)
  ignore <- sapply(1:b, function(j) {
     n.t <- sum(temp$treat == 1 & temp$b == j)
     n.c <- sum(temp$treat == 0 & temp$b == j)
     return(n.t < size | n.c < size | (n.t + n.c < size.new * 2))
  })
  
  
  # split unbalanced blocks into more blocks
  split <- which((abs(t.stat) > t.max) & (!ignore))
  
  if(length(split) == 0)
    break

  
  ## we need to keep a current copy of the block information and which block to ignore as block assignments are going to change later
  b.current <- temp$b
  
  for(j in split)
  {
    
    cutoff <- median(temp$e[b.current == j])
    ## We split units into two new blocks
    ## extract the index of units belonging to each new stratum
    idx.s <- which(b.current == j & temp$e < cutoff)
    idx.l <- which(b.current == j & temp$e > cutoff)
    ## randomly put half of the ties into one category
    idx.e <- which(temp$e == cutoff & b.current == j)
    n.tie <- length(idx.e)
    if (n.tie >= 1) {
      if (n.tie > 1) {
        idx.e <- sample(idx.e)
        idx.s <- c(idx.s, idx.e[1:round(n.tie/2)])
      }
      idx.l <- c(idx.l, idx.e[(round(n.tie/2)+ 1):n.tie])
    }
      
    ## we split only when new stratum has at least size number of control/treated units
    if (sum(temp$treat[idx.s]==1) > size && sum(temp$treat[idx.s]==0) > size && 
        sum(temp$treat[idx.l]==1) > size && sum(temp$treat[idx.l]==0) > size) {
      # anything above the current will have to be moved up 1
      temp$b[b.current > j] <- temp$b[b.current > j] + 1
      ## the upper new stratum will also have the block idx added by 1
      temp$b[idx.l] <- temp$b[idx.l] + 1
    }
    ## We don't do anything if we do not want to split
  }
  
   # calculate t statistic for each block
  b <- max(temp$b)
  t.stat <- sapply(1:b, function(j) {
    t.test(x = temp$e[temp$treat == 1 & temp$b == j], 
                        y = temp$e[temp$treat == 0 & temp$b == j], var.equal=T)$statistic
  })
  
  ## Update condition
  # check whether ANY blocks are above t.max
  # AND are not too small
  condition <- any(abs(t.stat) > t.max)
}

lalonde$blocks <- temp$b
print("number of individuals per strata")
[1] "number of individuals per strata"
table(lalonde[, c("treat", "blocks")])
     blocks
treat   1   2   3   4
    0  67  28  14  57
    1  16  13  28 110
## check the range of estimated propensity scores
lps_blocks <- sapply(1:max(lalonde$blocks), 
                     function(j) range(temp$e[temp$b == j]))
eps_blocks <- exp(lps_blocks)/(1 + exp(lps_blocks))
colnames(eps_blocks) <- paste("strata", 1:4)
rownames(eps_blocks) <- c("min eps", "max eps")
eps_blocks
         strata 1  strata 2  strata 3  strata 4
min eps 0.1002783 0.2225951 0.4509483 0.6052111
max eps 0.2161957 0.4487064 0.6051661 0.7891728
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, lalonde$blocks)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

We have better covariate balancing even with fewer blocks.

## CI from Neyman
result <- NeymanSRE(lalonde$treat, lalonde$re78, lalonde$blocks)
result <- c(result[1:2], c(result[1] - 1.96 * result[2], result[1] + 1.96 * result[2]))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
      est        sd  CI_lower  CI_upper 
 941.2842  886.9070 -797.0535 2679.6219 
---
title: 'R Example 7: propensity score stratification with lalonde data'
output: html_notebook
---

We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.


# Initial assessment

We obtain the data from a software called MatchIt. This is a subset of the data that you will deal with in HW4.

- Read data

```{r}
library("MatchIt")
data("lalonde")
lalonde
```

- Check initial covariate balancing  
```{r}
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
x <- model.matrix(model)[, -1]
n <- table(lalonde$treat)
z <- lalonde$treat
data_stat <- data.frame(t(apply(x, 2, function(x) c(mean(x[z==0]), sd(x[z==0]), mean(x[z==1]), sd(x[z==1])))))
colnames(data_stat) <- c("Mean control", "S.D. control", "Mean treated", "S.D. treated")
data_stat$t_stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
signif(data_stat, 2)
```

- Visualize covariates balancing

We use t statistics here as the mean differences for different variables do not have comparable scales

```{r}
library("ggplot2")
t.stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
temp <- data.frame(x = colnames(x), y = t.stat)
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```

T statistics show that the mean differences of many variables are significantly non-zero.


# Stratification based on the estimated propensity score

- Estimate propensity score

```{r}
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
summary(model)
```


- Check overlapping of the estimated propensity score

```{r}
library(ggplot2)
pscore <- model$fitted.values
temp.data <- data.frame(eps = pscore, treated = as.factor(lalonde$treat))
ggplot(temp.data, 
       aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Estimated propensity score") 
```
Overlap is extremely poor. Many control units have low propensity score. 


- Trimming to improve overlapping
```{r}
rm.idx <- which(pscore < 0.1 | pscore > 0.9)
idx.treated <- which((lalonde$treat == 1) & (pscore > max(pscore[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (pscore < min(pscore[lalonde$treat == 1])))
rm.idx <- union(rm.idx, c(idx.treated, idx.control))

lalonde <- lalonde[-rm.idx, ]
pscore <- pscore[-rm.idx]
lps <- predict(model)[-rm.idx]
x <- x[-rm.idx, ]

lalonde
```

Almost half of the units get trimmed out

- Stratification

Stratify the units into K = 5 equal strata and check covariates balancing

```{r}
NeymanSRE <- function(W, Y, strata.labels) {
  n <- length(W)
  groups <- unique(strata.labels)
  ests <- sapply(groups, function(gg) {
   wt <- sum(strata.labels == gg)/n
   est <- mean(Y[strata.labels == gg & W == 1]) - mean(Y[strata.labels == gg & W == 0])
   est.var <- var(Y[strata.labels == gg & W == 1])/sum(strata.labels == gg & W == 1) + 
     var(Y[strata.labels == gg & W == 0])/sum(strata.labels == gg & W == 0)
   return(c(wt, est, est.var))
 })
# print(ests)
 neyman.est <- sum(ests[1, ] * ests[2, ])
 neyman.var <- sum(ests[1, ]^2 * ests[3, ])
 return(c(est = neyman.est, se = sqrt(neyman.var), 
          tstat = neyman.est/sqrt(neyman.var)))
}

nn <- 5
q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, ps.strata)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```

Covariates get much better balanced.

- We also perform sequencing splitting to find the stratas

```{r}
## First, create a data frame to store the current grouping information
temp <- data.frame(e = lps, treat = lalonde$treat, b = 1)

t.max <-  1.96
# check whether t. stat is above t.max for first iteration
t.stat <- t.test(x = temp$e[temp$treat == 1], y =  temp$e[temp$treat == 0], var.equal=T)$statistic
condition = t.stat > t.max
# minimum n.t or n.c in each block
size <- 3
size.new <- 20

# continue until all t statistics are below t.max
set.seed(0)
while(condition)
{
  # calculate size in each group
  # we want to not split a block if it is too small
  b <- max(temp$b)
  ignore <- sapply(1:b, function(j) {
     n.t <- sum(temp$treat == 1 & temp$b == j)
     n.c <- sum(temp$treat == 0 & temp$b == j)
     return(n.t < size | n.c < size | (n.t + n.c < size.new * 2))
  })
  
  
  # split unbalanced blocks into more blocks
  split <- which((abs(t.stat) > t.max) & (!ignore))
  
  if(length(split) == 0)
    break

  
  ## we need to keep a current copy of the block information and which block to ignore as block assignments are going to change later
  b.current <- temp$b
  
  for(j in split)
  {
    
    cutoff <- median(temp$e[b.current == j])
    ## We split units into two new blocks
    ## extract the index of units belonging to each new stratum
    idx.s <- which(b.current == j & temp$e < cutoff)
    idx.l <- which(b.current == j & temp$e > cutoff)
    ## randomly put half of the ties into one category
    idx.e <- which(temp$e == cutoff & b.current == j)
    n.tie <- length(idx.e)
    if (n.tie >= 1) {
      if (n.tie > 1) {
        idx.e <- sample(idx.e)
        idx.s <- c(idx.s, idx.e[1:round(n.tie/2)])
      }
      idx.l <- c(idx.l, idx.e[(round(n.tie/2)+ 1):n.tie])
    }
      
    ## we split only when new stratum has at least size number of control/treated units
    if (sum(temp$treat[idx.s]==1) > size && sum(temp$treat[idx.s]==0) > size && 
        sum(temp$treat[idx.l]==1) > size && sum(temp$treat[idx.l]==0) > size) {
      # anything above the current will have to be moved up 1
      temp$b[b.current > j] <- temp$b[b.current > j] + 1
      ## the upper new stratum will also have the block idx added by 1
      temp$b[idx.l] <- temp$b[idx.l] + 1
    }
    ## We don't do anything if we do not want to split
  }
  
   # calculate t statistic for each block
  b <- max(temp$b)
  t.stat <- sapply(1:b, function(j) {
    t.test(x = temp$e[temp$treat == 1 & temp$b == j], 
                        y = temp$e[temp$treat == 0 & temp$b == j], var.equal=T)$statistic
  })
  
  ## Update condition
  # check whether ANY blocks are above t.max
  # AND are not too small
  condition <- any(abs(t.stat) > t.max)
}

lalonde$blocks <- temp$b
print("number of individuals per strata")
table(lalonde[, c("treat", "blocks")])

## check the range of estimated propensity scores
lps_blocks <- sapply(1:max(lalonde$blocks), 
                     function(j) range(temp$e[temp$b == j]))
eps_blocks <- exp(lps_blocks)/(1 + exp(lps_blocks))
colnames(eps_blocks) <- paste("strata", 1:4)
rownames(eps_blocks) <- c("min eps", "max eps")
eps_blocks
```

- Check covariates balancing again

```{r}
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, lalonde$blocks)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```
We have better covariate balancing even with fewer blocks.

- Next, we treat the data as from a stratified randomized experiment and use Neyman's estimator

```{r}
## CI from Neyman
result <- NeymanSRE(lalonde$treat, lalonde$re78, lalonde$blocks)
result <- c(result[1:2], c(result[1] - 1.96 * result[2], result[1] + 1.96 * result[2]))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
```


