We still use the lalonde data from the MatchIt package and use the propensity score model that we found out in R example 5
library("MatchIt")
data("lalonde")
model <- glm(treat ~ re74 + race + married + I(re74^2) + re74:race, data = lalonde, family = "binomial")
eps <- predict(model, type = "response")
lalonde
Then we perform a first check of the weights by drawing the histogram or boxplot of the weights. At this moment, the figures are not very informative as there are units with extremely large weights.
## Calculate the raw weights (before normalization)
## Weights to estimate ATE
n.treated <- sum(lalonde$treat == 1)
n.control <- sum(lalonde$treat == 0)
weights <- ifelse(lalonde$treat == 1, 1/eps, 1/(1 - eps))
## Check the weights histogram
library(ggplot2)
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = weights, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") +
xlab("Weights")
## We can also draw boxplots
ggplot(temp.data, aes(x = treated, y = weights, color = treated)) +
geom_boxplot()
- Check covariate balancing. If we do not perform any trimming and just look at the full data, we can check covariate balancing from the love plot or summary data of the standardized mean difference. Here, we need to write our own function to draw the love plot.
## Need to change race (categorical) into indicators (numerical)
lalonde$black <- lalonde$race == "black"
lalonde$hispan <- lalonde$race == "hispan"
lalonde$white <- lalonde$race == "white"
## Draw love plot
love.plot = function(cov, treat, ## cov is the matrix of covariates and treat is a vector of treatment assignment
weights = rep(1, length(treat)),
plot = F)
{
## mean with normalized weights \sum w_i x_i / (\sum w_i)
treat.means <- colSums(cov[treat == 1,] * weights[treat == 1])/sum(weights[treat == 1])
treat.var <- colSums(t(t(cov[treat == 1,]) - treat.means)^2 *
weights[treat == 1])/sum(weights[treat == 1])
control.means <- colSums(cov[treat == 0,] * weights[treat == 0])/sum(weights[treat == 0])
control.var <- colSums(t(t(cov[treat == 0,]) - control.means)^2 *
weights[treat == 0])/sum(weights[treat == 0])
## the standardized mean differences for every covariate
smd <- (treat.means - control.means)/sqrt((treat.var + control.var)/2)
names(smd) <- colnames(cov)
if (plot == T) {
plot.data <- data.frame(smd = smd, covariates = names(smd))
range <- max(abs(smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates)) +
geom_vline(xintercept = 0) + xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
}
return(smd)
}
colnames(lalonde)
[1] "treat" "age" "educ" "race" "married" "nodegree" "re74" "re75"
[9] "re78" "black" "hispan" "white"
raw.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat)
weighted.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat, weights = weights)
plot.data <- data.frame(smd = c(raw.smd, weighted.smd),
covariates = c(names(raw.smd), names(weighted.smd)),
category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
linetype = c("solid", "dashed", "solid", "dashed", "solid")) +
xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
An alternative approach (no love plot) is to use the survey package to create a summary table
library(survey)
## Another simplier way to check balance (no love plot)
weighteddata <- svydesign(ids = ~ 1, data = lalonde, weights = ~weights)
weightedtable <- svyCreateTableOne(vars = colnames(lalonde)[2:8], strata = "treat", data = weighteddata, test = F)
print(weightedtable, smd = T) ## The first row is the summation of weights
Stratified by treat
0 1 SMD
n 615.43 586.79
age (mean (SD)) 27.01 (10.81) 25.46 (6.20) 0.177
educ (mean (SD)) 10.15 (2.84) 10.95 (1.93) 0.328
race (%) 0.043
black 244.6 (39.7) 243.3 (41.5)
hispan 72.0 (11.7) 71.2 (12.1)
white 298.8 (48.6) 272.3 (46.4)
married (mean (SD)) 0.41 (0.49) 0.35 (0.48) 0.110
nodegree (mean (SD)) 0.62 (0.49) 0.52 (0.50) 0.199
re74 (mean (SD)) 4508.71 (6374.58) 3678.23 (4795.67) 0.147
re75 (mean (SD)) 2090.95 (3097.31) 1919.51 (3273.26) 0.054
Now we perform trimming and check covariate balancing again
## Histogram of estimated propensity score
temp.data <- data.frame(eps = eps, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = eps, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
ggtitle("Histogram of eps before trimming")
rm.idx <- which(eps < 0.1 | eps > 0.9)
## remove control units
length(rm.idx)
[1] 237
## Check the histogram of eps again
ggplot(temp.data[-rm.idx, ], aes(x = eps, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
ggtitle("Histogram of eps after trimming")
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
## We can also draw boxplots
ggplot(temp.data[-rm.idx, ], aes(x = treated, y = weights, color = treated)) +
geom_boxplot()
Check covariate balancing after trimming. Covariate balancing is much better.
raw.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx])
weighted.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx],
weights = weights[-rm.idx])
plot.data <- data.frame(smd = c(raw.smd, weighted.smd),
covariates = c(names(raw.smd), names(weighted.smd)),
category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
linetype = c("solid", "dashed", "solid", "dashed", "solid")) +
xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
- Estimate causal effect by weighted least square regression and estimate the variance by Sandwich estimator
lm.result <- lm(re78 ~ treat, weights = weights[-rm.idx], data = lalonde[-rm.idx, ])
summary(lm.result)
Call:
lm(formula = re78 ~ treat, data = lalonde[-rm.idx, ], weights = weights[-rm.idx])
Weighted Residuals:
Min 1Q Median 3Q Max
-17233 -6644 -2553 4239 64304
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5135.9 480.0 10.700 <2e-16 ***
treat 1203.2 676.4 1.779 0.0761 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9335 on 375 degrees of freedom
Multiple R-squared: 0.008368, Adjusted R-squared: 0.005723
F-statistic: 3.164 on 1 and 375 DF, p-value: 0.07607
library(sandwich)
SE <- sqrt(diag(vcovHC(lm.result, type = "HC2")))[2]
## get the 95% CI
result <- c(lm.result$coefficients[2], SE, c(tau_hat- 1.96 * SE, tau_hat + 1.96 * SE))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
est sd CI_lower CI_upper
1203.2270 768.3267 -393.7199 2618.1209
---
title: "R Example 6: Inverse propensity score weighting"
output: html_notebook
---

We still use the lalonde data from the MatchIt package and use the propensity score model that we found out in R example 5

```{r}
library("MatchIt")
data("lalonde")

model <- glm(treat ~ re74 + race + married + I(re74^2) + re74:race, data = lalonde, family = "binomial")
eps <- predict(model, type = "response")

lalonde
```

Then we perform a first check of the weights by drawing the histogram or boxplot of the weights. At this moment, the figures are not very informative as there are units with extremely large weights.
```{r}
## Calculate the raw weights (before normalization)
## Weights to estimate ATE
n.treated <- sum(lalonde$treat == 1)
n.control <- sum(lalonde$treat == 0)
weights <- ifelse(lalonde$treat == 1, 1/eps, 1/(1 - eps))

## Check the weights histogram
library(ggplot2)
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = weights, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Weights") 

## We can also draw boxplots
ggplot(temp.data, aes(x = treated, y = weights, color = treated)) + 
  geom_boxplot() 
```

- Check covariate balancing.
If we do not perform any trimming and just look at the full data, we can check covariate balancing from the love plot or summary data of the standardized mean difference. Here, we need to write our own function to draw the love plot. 
```{r}
## Need to change race (categorical) into indicators (numerical)
lalonde$black <- lalonde$race == "black"
lalonde$hispan <- lalonde$race == "hispan"
lalonde$white <- lalonde$race == "white"

## Draw love plot
love.plot = function(cov, treat,  ## cov is the matrix of covariates and treat is a vector of treatment assignment
                     weights = rep(1, length(treat)),
                     plot = F) 
{
    
    ## mean with normalized weights \sum w_i x_i / (\sum w_i)
  treat.means <- colSums(cov[treat == 1,] * weights[treat == 1])/sum(weights[treat == 1])
  treat.var <- colSums(t(t(cov[treat == 1,]) - treat.means)^2 *
                          weights[treat == 1])/sum(weights[treat == 1])
  
  control.means <- colSums(cov[treat == 0,] * weights[treat == 0])/sum(weights[treat == 0])
  control.var <- colSums(t(t(cov[treat == 0,]) - control.means)^2 *
                          weights[treat == 0])/sum(weights[treat == 0])
  
  ## the standardized mean differences for every covariate
  smd <- (treat.means - control.means)/sqrt((treat.var + control.var)/2)
  names(smd) <- colnames(cov)
  
  if (plot == T) {
    plot.data <- data.frame(smd = smd, covariates = names(smd))
    range <- max(abs(smd))
    ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates)) +
      geom_vline(xintercept = 0) + xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
  }
  return(smd)
}

colnames(lalonde)
raw.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat)
weighted.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat, weights = weights)


plot.data <- data.frame(smd = c(raw.smd, weighted.smd), 
                        covariates = c(names(raw.smd), names(weighted.smd)),
                        category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))

ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
      geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
                 linetype = c("solid", "dashed", "solid", "dashed", "solid")) + 
      xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
```

An alternative approach (no love plot) is to use the survey package to create a summary table
```{r}
library(survey)
## Another simplier way to check balance (no love plot)
weighteddata <- svydesign(ids = ~ 1, data = lalonde, weights = ~weights)
weightedtable <- svyCreateTableOne(vars = colnames(lalonde)[2:8], strata = "treat", data = weighteddata, test = F)
print(weightedtable, smd = T) ## The first row is the summation of weights
```

Now we perform trimming and check covariate balancing again
```{r}
## Histogram of estimated propensity score
temp.data <- data.frame(eps = eps, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
  ggtitle("Histogram of eps before trimming")

rm.idx <- which(eps < 0.1 | eps > 0.9)
## remove control units
length(rm.idx)

## Check the histogram of eps again
ggplot(temp.data[-rm.idx, ], aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
  ggtitle("Histogram of eps after trimming")


temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
## We can also draw boxplots
ggplot(temp.data[-rm.idx, ], aes(x = treated, y = weights, color = treated)) + 
  geom_boxplot() 
```

Check covariate balancing after trimming. Covariate balancing is much better.
```{r}
raw.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx])
weighted.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx], 
                          weights = weights[-rm.idx])
plot.data <- data.frame(smd = c(raw.smd, weighted.smd), 
                        covariates = c(names(raw.smd), names(weighted.smd)),
                        category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
      geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
                 linetype = c("solid", "dashed", "solid", "dashed", "solid")) + 
      xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
```

- Estimate causal effect by weighted least square regression and estimate the variance by Sandwich estimator

```{r}
lm.result <- lm(re78 ~ treat, weights = weights[-rm.idx], data = lalonde[-rm.idx, ])
summary(lm.result)

library(sandwich)
SE <- sqrt(diag(vcovHC(lm.result, type = "HC2")))[2]

## get the 95% CI
result <- c(lm.result$coefficients[2], SE, c(tau_hat- 1.96 * SE, tau_hat + 1.96 * SE))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
```
