We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.
Initial assessment
We obtain the data from a software called MatchIt. This is a subset of the data that you will deal with in HW4.
library("MatchIt")
There were 16 warnings (use warnings() to see them)
data("lalonde")
lalonde
- Check initial covariate balancing
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
x <- model.matrix(model)[, -1]
n <- table(lalonde$treat)
z <- lalonde$treat
data_stat <- data.frame(t(apply(x, 2, function(x) c(mean(x[z==0]), sd(x[z==0]), mean(x[z==1]), sd(x[z==1])))))
colnames(data_stat) <- c("Mean control", "S.D. control", "Mean treated", "S.D. treated")
data_stat$t_stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
signif(data_stat, 2)
- Visualize covariates balancing
We use t statistics here as the mean differences for different variables do not have comparable scales
library("ggplot2")
t.stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
temp <- data.frame(x = colnames(x), y = t.stat)
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
geom_hline(yintercept = -1.96, color = "red")+
geom_hline(yintercept = 1.96, color = "red") +
xlab("") + ylab("t statistics") + theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
T statistics show that the mean differences of many variables are significantly non-zero.
Stratification based on the estimated propensity score
- Estimate propensity score
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
summary(model)
Call:
glm(formula = treat ~ ., family = "binomial", data = lalonde[,
-9])
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7645 -0.4736 -0.2862 0.7508 2.7169
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.663e+00 9.709e-01 -1.713 0.08668 .
age 1.578e-02 1.358e-02 1.162 0.24521
educ 1.613e-01 6.513e-02 2.477 0.01325 *
racehispan -2.082e+00 3.672e-01 -5.669 1.44e-08 ***
racewhite -3.065e+00 2.865e-01 -10.699 < 2e-16 ***
married -8.321e-01 2.903e-01 -2.866 0.00415 **
nodegree 7.073e-01 3.377e-01 2.095 0.03620 *
re74 -7.178e-05 2.875e-05 -2.497 0.01253 *
re75 5.345e-05 4.635e-05 1.153 0.24884
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 751.49 on 613 degrees of freedom
Residual deviance: 487.84 on 605 degrees of freedom
AIC: 505.84
Number of Fisher Scoring iterations: 5
- Check overlapping of the estimated propensity score
library(ggplot2)
pscore <- model$fitted.values
temp.data <- data.frame(eps = pscore, treated = as.factor(lalonde$treat))
ggplot(temp.data,
aes(x = eps, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") +
xlab("Estimated propensity score")
Overlap is extremely poor. Many control units have low propensity score.
- Trimming to improve overlapping
rm.idx <- which(pscore < 0.1 | pscore > 0.9)
idx.treated <- which((lalonde$treat == 1) & (pscore > max(pscore[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (pscore < min(pscore[lalonde$treat == 1])))
rm.idx <- union(rm.idx, c(idx.treated, idx.control))
lalonde <- lalonde[-rm.idx, ]
pscore <- pscore[-rm.idx]
lps <- predict(model)[-rm.idx]
x <- x[-rm.idx, ]
lalonde
Almost half of the units get trimmed out
Stratify the units into K = 5 equal strata and check covariates balancing
NeymanSRE <- function(W, Y, strata.labels) {
n <- length(W)
groups <- unique(strata.labels)
ests <- sapply(groups, function(gg) {
wt <- sum(strata.labels == gg)/n
est <- mean(Y[strata.labels == gg & W == 1]) - mean(Y[strata.labels == gg & W == 0])
est.var <- var(Y[strata.labels == gg & W == 1])/sum(strata.labels == gg & W == 1) +
var(Y[strata.labels == gg & W == 0])/sum(strata.labels == gg & W == 0)
return(c(wt, est, est.var))
})
# print(ests)
neyman.est <- sum(ests[1, ] * ests[2, ])
neyman.var <- sum(ests[1, ]^2 * ests[3, ])
return(c(est = neyman.est, se = sqrt(neyman.var),
tstat = neyman.est/sqrt(neyman.var)))
}
nn <- 5
q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, ps.strata))
temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
geom_hline(yintercept = -1.96, color = "red")+
geom_hline(yintercept = 1.96, color = "red") +
xlab("") + ylab("t statistics") + theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
Covariates get much better balanced.
- We also perform sequencing splitting to find the stratas
## First, create a data frame to store the current grouping information
temp <- data.frame(e = lps, treat = lalonde$treat, b = 1)
t.max <- 1.96
# check whether t. stat is above t.max for first iteration
t.stat <- t.test(x = temp$e[temp$treat == 1], y = temp$e[temp$treat == 0], var.equal=T)$statistic
condition = t.stat > t.max
# minimum n.t or n.c in each block
size <- 3
size.new <- 20
# continue until all t statistics are below t.max
set.seed(0)
while(condition)
{
# calculate size in each group
# we want to not split a block if it is too small
b <- max(temp$b)
ignore <- sapply(1:b, function(j) {
n.t <- sum(temp$treat == 1 & temp$b == j)
n.c <- sum(temp$treat == 0 & temp$b == j)
return(n.t < size | n.c < size | (n.t + n.c < size.new * 2))
})
# split unbalanced blocks into more blocks
split <- which((abs(t.stat) > t.max) & (!ignore))
if(length(split) == 0)
break
## we need to keep a current copy of the block information and which block to ignore as block assignments are going to change later
b.current <- temp$b
for(j in split)
{
cutoff <- median(temp$e[b.current == j])
## We split units into two new blocks
## extract the index of units belonging to each new stratum
idx.s <- which(b.current == j & temp$e < cutoff)
idx.l <- which(b.current == j & temp$e > cutoff)
## randomly put half of the ties into one category
idx.e <- which(temp$e == cutoff & b.current == j)
n.tie <- length(idx.e)
if (n.tie >= 1) {
if (n.tie > 1) {
idx.e <- sample(idx.e)
idx.s <- c(idx.s, idx.e[1:round(n.tie/2)])
}
idx.l <- c(idx.l, idx.e[(round(n.tie/2)+ 1):n.tie])
}
## we split only when new stratum has at least size number of control/treated units
if (sum(temp$treat[idx.s]==1) > size && sum(temp$treat[idx.s]==0) > size &&
sum(temp$treat[idx.l]==1) > size && sum(temp$treat[idx.l]==0) > size) {
# anything above the current will have to be moved up 1
temp$b[b.current > j] <- temp$b[b.current > j] + 1
## the upper new stratum will also have the block idx added by 1
temp$b[idx.l] <- temp$b[idx.l] + 1
}
## We don't do anything if we do not want to split
}
# calculate t statistic for each block
b <- max(temp$b)
t.stat <- sapply(1:b, function(j) {
t.test(x = temp$e[temp$treat == 1 & temp$b == j],
y = temp$e[temp$treat == 0 & temp$b == j], var.equal=T)$statistic
})
## Update condition
# check whether ANY blocks are above t.max
# AND are not too small
condition <- any(abs(t.stat) > t.max)
}
lalonde$blocks <- temp$b
print("number of individuals per strata")
[1] "number of individuals per strata"
table(lalonde[, c("treat", "blocks")])
blocks
treat 1 2 3 4
0 67 28 14 57
1 16 13 28 110
## check the range of estimated propensity scores
lps_blocks <- sapply(1:max(lalonde$blocks),
function(j) range(temp$e[temp$b == j]))
eps_blocks <- exp(lps_blocks)/(1 + exp(lps_blocks))
colnames(eps_blocks) <- paste("strata", 1:4)
rownames(eps_blocks) <- c("min eps", "max eps")
eps_blocks
strata 1 strata 2 strata 3 strata 4
min eps 0.1002783 0.2225951 0.4509483 0.6052111
max eps 0.2161957 0.4487064 0.6051661 0.7891728
- Check covariates balancing again
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, lalonde$blocks))
temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
geom_hline(yintercept = -1.96, color = "red")+
geom_hline(yintercept = 1.96, color = "red") +
xlab("") + ylab("t statistics") + theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))
We have better covariate balancing even with fewer blocks.
- Next, we treat the data as from a stratified randomized experiment and use Neyman’s estimator
## CI from Neyman
result <- NeymanSRE(lalonde$treat, lalonde$re78, lalonde$blocks)
result <- c(result[1:2], c(result[1] - 1.96 * result[2], result[1] + 1.96 * result[2]))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
est sd CI_lower CI_upper
941.2842 886.9070 -797.0535 2679.6219
---
title: 'R Example 7: propensity score stratification with lalonde data'
output: html_notebook
---

We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.


# Initial assessment

We obtain the data from a software called MatchIt. This is a subset of the data that you will deal with in HW4.

- Read data

```{r}
library("MatchIt")
data("lalonde")
lalonde
```

- Check initial covariate balancing  
```{r}
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
x <- model.matrix(model)[, -1]
n <- table(lalonde$treat)
z <- lalonde$treat
data_stat <- data.frame(t(apply(x, 2, function(x) c(mean(x[z==0]), sd(x[z==0]), mean(x[z==1]), sd(x[z==1])))))
colnames(data_stat) <- c("Mean control", "S.D. control", "Mean treated", "S.D. treated")
data_stat$t_stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
signif(data_stat, 2)
```

- Visualize covariates balancing

We use t statistics here as the mean differences for different variables do not have comparable scales

```{r}
library("ggplot2")
t.stat <- (data_stat[, 3] - data_stat[, 1])/sqrt(data_stat[, 2]^2/n[1] + data_stat[, 4]^2/n[2])
temp <- data.frame(x = colnames(x), y = t.stat)
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```

T statistics show that the mean differences of many variables are significantly non-zero.


# Stratification based on the estimated propensity score

- Estimate propensity score

```{r}
model <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
summary(model)
```


- Check overlapping of the estimated propensity score

```{r}
library(ggplot2)
pscore <- model$fitted.values
temp.data <- data.frame(eps = pscore, treated = as.factor(lalonde$treat))
ggplot(temp.data, 
       aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Estimated propensity score") 
```
Overlap is extremely poor. Many control units have low propensity score. 


- Trimming to improve overlapping
```{r}
rm.idx <- which(pscore < 0.1 | pscore > 0.9)
idx.treated <- which((lalonde$treat == 1) & (pscore > max(pscore[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (pscore < min(pscore[lalonde$treat == 1])))
rm.idx <- union(rm.idx, c(idx.treated, idx.control))

lalonde <- lalonde[-rm.idx, ]
pscore <- pscore[-rm.idx]
lps <- predict(model)[-rm.idx]
x <- x[-rm.idx, ]

lalonde
```

Almost half of the units get trimmed out

- Stratification

Stratify the units into K = 5 equal strata and check covariates balancing

```{r}
NeymanSRE <- function(W, Y, strata.labels) {
  n <- length(W)
  groups <- unique(strata.labels)
  ests <- sapply(groups, function(gg) {
   wt <- sum(strata.labels == gg)/n
   est <- mean(Y[strata.labels == gg & W == 1]) - mean(Y[strata.labels == gg & W == 0])
   est.var <- var(Y[strata.labels == gg & W == 1])/sum(strata.labels == gg & W == 1) + 
     var(Y[strata.labels == gg & W == 0])/sum(strata.labels == gg & W == 0)
   return(c(wt, est, est.var))
 })
# print(ests)
 neyman.est <- sum(ests[1, ] * ests[2, ])
 neyman.var <- sum(ests[1, ]^2 * ests[3, ])
 return(c(est = neyman.est, se = sqrt(neyman.var), 
          tstat = neyman.est/sqrt(neyman.var)))
}

nn <- 5
q.pscore <- quantile(pscore , (1:( nn -1)) / nn)
ps.strata <- cut(pscore, breaks = c(0 , q.pscore ,1), labels = 1:nn)
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, ps.strata)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```

Covariates get much better balanced.

- We also perform sequencing splitting to find the stratas

```{r}
## First, create a data frame to store the current grouping information
temp <- data.frame(e = lps, treat = lalonde$treat, b = 1)

t.max <-  1.96
# check whether t. stat is above t.max for first iteration
t.stat <- t.test(x = temp$e[temp$treat == 1], y =  temp$e[temp$treat == 0], var.equal=T)$statistic
condition = t.stat > t.max
# minimum n.t or n.c in each block
size <- 3
size.new <- 20

# continue until all t statistics are below t.max
set.seed(0)
while(condition)
{
  # calculate size in each group
  # we want to not split a block if it is too small
  b <- max(temp$b)
  ignore <- sapply(1:b, function(j) {
     n.t <- sum(temp$treat == 1 & temp$b == j)
     n.c <- sum(temp$treat == 0 & temp$b == j)
     return(n.t < size | n.c < size | (n.t + n.c < size.new * 2))
  })
  
  
  # split unbalanced blocks into more blocks
  split <- which((abs(t.stat) > t.max) & (!ignore))
  
  if(length(split) == 0)
    break

  
  ## we need to keep a current copy of the block information and which block to ignore as block assignments are going to change later
  b.current <- temp$b
  
  for(j in split)
  {
    
    cutoff <- median(temp$e[b.current == j])
    ## We split units into two new blocks
    ## extract the index of units belonging to each new stratum
    idx.s <- which(b.current == j & temp$e < cutoff)
    idx.l <- which(b.current == j & temp$e > cutoff)
    ## randomly put half of the ties into one category
    idx.e <- which(temp$e == cutoff & b.current == j)
    n.tie <- length(idx.e)
    if (n.tie >= 1) {
      if (n.tie > 1) {
        idx.e <- sample(idx.e)
        idx.s <- c(idx.s, idx.e[1:round(n.tie/2)])
      }
      idx.l <- c(idx.l, idx.e[(round(n.tie/2)+ 1):n.tie])
    }
      
    ## we split only when new stratum has at least size number of control/treated units
    if (sum(temp$treat[idx.s]==1) > size && sum(temp$treat[idx.s]==0) > size && 
        sum(temp$treat[idx.l]==1) > size && sum(temp$treat[idx.l]==0) > size) {
      # anything above the current will have to be moved up 1
      temp$b[b.current > j] <- temp$b[b.current > j] + 1
      ## the upper new stratum will also have the block idx added by 1
      temp$b[idx.l] <- temp$b[idx.l] + 1
    }
    ## We don't do anything if we do not want to split
  }
  
   # calculate t statistic for each block
  b <- max(temp$b)
  t.stat <- sapply(1:b, function(j) {
    t.test(x = temp$e[temp$treat == 1 & temp$b == j], 
                        y = temp$e[temp$treat == 0 & temp$b == j], var.equal=T)$statistic
  })
  
  ## Update condition
  # check whether ANY blocks are above t.max
  # AND are not too small
  condition <- any(abs(t.stat) > t.max)
}

lalonde$blocks <- temp$b
print("number of individuals per strata")
table(lalonde[, c("treat", "blocks")])

## check the range of estimated propensity scores
lps_blocks <- sapply(1:max(lalonde$blocks), 
                     function(j) range(temp$e[temp$b == j]))
eps_blocks <- exp(lps_blocks)/(1 + exp(lps_blocks))
colnames(eps_blocks) <- paste("strata", 1:4)
rownames(eps_blocks) <- c("min eps", "max eps")
eps_blocks
```

- Check covariates balancing again

```{r}
balance_check <- apply(x, 2, function(v) NeymanSRE(lalonde$treat, v, lalonde$blocks)) 


temp <- data.frame(x = colnames(x), y = balance_check[3, ])
ggplot(temp, aes(x, y)) + geom_point() + geom_hline(yintercept = 0) +
  geom_hline(yintercept = -1.96, color = "red")+ 
  geom_hline(yintercept = 1.96, color = "red") +
  xlab("") + ylab("t statistics") + theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 
```
We have better covariate balancing even with fewer blocks.

- Next, we treat the data as from a stratified randomized experiment and use Neyman's estimator

```{r}
## CI from Neyman
result <- NeymanSRE(lalonde$treat, lalonde$re78, lalonde$blocks)
result <- c(result[1:2], c(result[1] - 1.96 * result[2], result[1] + 1.96 * result[2]))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
```


