We use a software called MatchIt to perform matching for us. Our example here mostly follow the tutorial of MatchIt package here[https://kosukeimai.github.io/MatchIt/articles/MatchIt.html].

We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.

library("MatchIt")
data("lalonde")
lalonde
m.out0 <- matchit(treat ~ age + educ + race + married + 
                   nodegree + re74 + re75, data = lalonde,
                 method = NULL, distance = "glm")
## distance here refer to a method to estimate the propensity score
## No matching is performed at this stage
summary(m.out0)

Call:
matchit(formula = treat ~ age + educ + race + married + nodegree + 
    re74 + re75, data = lalonde, method = NULL, distance = "glm")

Summary of Balance for All Data:
           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean eCDF Max
distance          0.5774        0.1822          1.7941     0.9211    0.3774   0.6444
age              25.8162       28.0303         -0.3094     0.4400    0.0813   0.1577
educ             10.3459       10.2354          0.0550     0.4959    0.0347   0.1114
raceblack         0.8432        0.2028          1.7615          .    0.6404   0.6404
racehispan        0.0595        0.1422         -0.3498          .    0.0827   0.0827
racewhite         0.0973        0.6550         -1.8819          .    0.5577   0.5577
married           0.1892        0.5128         -0.8263          .    0.3236   0.3236
nodegree          0.7081        0.5967          0.2450          .    0.1114   0.1114
re74           2095.5737     5619.2365         -0.7211     0.5181    0.2248   0.4470
re75           1532.0553     2466.4844         -0.2903     0.9563    0.1342   0.2876


Sample Sizes:
          Control Treated
All           429     185
Matched       429     185
Unmatched       0       0
Discarded       0       0
## Love plot before matching
plot(summary(m.out0), abs = F)

## Default thresholds: solid line 0.1, dashed line 0.05

– Add linear terms

Set re74 is the base covariate that we know that we need to add a priori

temp <- glm(treat ~ . , data = lalonde[, -9], family = "binomial")
temp0 <- glm(treat ~re74, data = lalonde[, -9], family = "binomial")
library(MASS)
## can set trace = F to have more concise messages
model <- stepAIC(temp0, scope=list(upper = formula(temp), lower = formula(temp0)), direction = "forward")
Start:  AIC=708.54
treat ~ re74

           Df Deviance    AIC
+ race      2   504.05 512.05
+ married   1   675.29 681.29
<none>          704.54 708.54
+ nodegree  1   702.77 708.77
+ educ      1   702.85 708.85
+ re75      1   704.15 710.15
+ age       1   704.23 710.23

Step:  AIC=512.05
treat ~ re74 + race

           Df Deviance    AIC
+ married   1   496.34 506.34
+ educ      1   501.25 511.25
<none>          504.05 512.05
+ re75      1   503.72 513.72
+ nodegree  1   503.82 513.82
+ age       1   503.94 513.94

Step:  AIC=506.34
treat ~ re74 + race + married

           Df Deviance    AIC
<none>          496.34 506.34
+ educ      1   494.55 506.55
+ re75      1   494.92 506.92
+ nodegree  1   495.94 507.94
+ age       1   496.00 508.00
summary(model)

Call:
glm(formula = treat ~ re74 + race + married, family = "binomial", 
    data = lalonde[, -9])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5694  -0.4638  -0.3033   0.8308   2.6076  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  8.863e-01  1.565e-01   5.663 1.48e-08 ***
re74        -4.949e-05  2.275e-05  -2.176  0.02958 *  
racehispan  -2.150e+00  3.596e-01  -5.979 2.25e-09 ***
racewhite   -3.062e+00  2.826e-01 -10.833  < 2e-16 ***
married     -7.262e-01  2.631e-01  -2.760  0.00578 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 751.49  on 613  degrees of freedom
Residual deviance: 496.34  on 609  degrees of freedom
AIC: 506.34

Number of Fisher Scoring iterations: 5

– Add interactions

tempfull <- glm(treat ~(race + married + re74)^2 + I(re74^2), data = lalonde[, -9], family = "binomial")
library(MASS)
model <- stepAIC(model, scope=list(upper = formula(tempfull), lower = formula(model)), direction = "forward")
Start:  AIC=506.34
treat ~ re74 + race + married

               Df Deviance    AIC
+ race:re74     2   489.63 503.63
+ I(re74^2)     1   491.84 503.84
<none>              496.34 506.34
+ race:married  2   492.51 506.51
+ married:re74  1   496.04 508.04

Step:  AIC=503.63
treat ~ re74 + race + married + re74:race

               Df Deviance    AIC
+ I(re74^2)     1   486.57 502.57
<none>              489.63 503.63
+ married:re74  1   489.55 505.55
+ race:married  2   488.29 506.29

Step:  AIC=502.57
treat ~ re74 + race + married + I(re74^2) + re74:race

               Df Deviance    AIC
<none>              486.57 502.57
+ married:re74  1   485.92 503.92
+ race:married  2   485.13 505.13
summary(model)

Call:
glm(formula = treat ~ re74 + race + married + I(re74^2) + re74:race, 
    family = "binomial", data = lalonde[, -9])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5612  -0.5542  -0.2251   0.8372   3.0364  

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)      8.683e-01  1.664e-01   5.218 1.81e-07 ***
re74            -1.148e-04  6.260e-05  -1.834   0.0667 .  
racehispan      -2.034e+00  4.175e-01  -4.871 1.11e-06 ***
racewhite       -2.642e+00  3.183e-01  -8.300  < 2e-16 ***
married         -6.451e-01  2.665e-01  -2.420   0.0155 *  
I(re74^2)        4.976e-09  3.270e-09   1.522   0.1281    
re74:racehispan -2.781e-05  7.658e-05  -0.363   0.7165    
re74:racewhite  -1.644e-04  9.291e-05  -1.770   0.0768 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 751.49  on 613  degrees of freedom
Residual deviance: 486.57  on 606  degrees of freedom
AIC: 502.57

Number of Fisher Scoring iterations: 7
## obtain linearized propensity scores
lps <- predict(model)

– Drop control and treated units that are out of the estimated propenstiy score range

library(ggplot2)
temp.data <- data.frame(lps = lps, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = lps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Linearized propensity score") 


idx.treated <- which((lalonde$treat == 1) & (lps > max(lps[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (lps < min(lps[lalonde$treat == 1])))
idx <- c(idx.treated, idx.control)  ## dropped two treated units and 77 control units
lalonde <- lalonde[-idx, ]
lps <- lps[-idx]

From the histogram, we see a potential issue here, which is that there are not enough controls that can be potentially matched with the treated units for units whose lps > 0.

Greedy algorithm, covariate balancing is not satisfying

## Greedy algorithm
m.out1 <- matchit(treat ~ age + educ + race + married + 
                   nodegree + re74 + re75, data = lalonde,
                  estimand = "ATT", 
                 method = "nearest", distance = lps)
plot(summary(m.out1), abs = F)