########## MATCHING EXAMPLES ############################################# ## USE KEN CHAY'S DATA library(MASS) library(foreign) library(Matching) ## Read data dta<-read.dta(file="W:/Teaching/Fall2006_PolSci239/Section_Notes/Section5/smoking2.dta") names(dta) dim(dta) # Check if there are NAs apply(dta,2,function(x) sum(is.na(x))) # Treatment variable: tobacco==> indicator equal to one if mother smokes # Outcome variables: # death==> indicator equal to one if infant died within 1 year of birth # dbirwr==> birth weight of the infant (in grams) # Covariates: dmage, dmeduc, mblack, etc. # Number of treatment observations sum(dta$tobacco) # Number of control observations nrow(dta)-sum(dta$tobacco) # Mean matching meanmatch<-function(dtreat,x) { N.tr<-length(x[dtreat==1]) # Number of treatments mean.tr<-mean(x[dtreat==1]) # Treatment mean sample.co<-numeric(N.tr) # Vector for sample of matched controls index<-1:length(x) # Index # Initialize dis=abs(mean.tr-x) frame1<-data.frame(index,x,dis,Tr)[order(dis),] for(i in 1:N.tr) { if (i>1) { new.mean<-sapply(x[-sample.co],function(x)(x+old.mean.co*(i-1))/i) dis<-abs(mean.tr-new.mean) frame1<-data.frame(index[-sample.co],x[-sample.co],dis,Tr[-sample.co])[order(dis),] } sample.co[i]<-subset(frame1,frame1[,4]==0)[1,1] old.mean.co<-mean(x[sample.co]) } list(matched.control.sample=sample.co) } # Since the data has too many observations, I take a sample set.seed(1234) # sample(nrow(dta),5000) # If 'x' has length 1, sampling takes place from '1:x'. # dta2<-dta[sample(nrow(dta),3000),] # Sample the rows of dta and then I index # Or let's take 1000 treatments and 2000 controls dta2<-rbind(subset(dta,dta$tobacco==1)[sample(sum(dta$tobacco==1), 1000, replace=FALSE),],subset(dta,dta$tobacco==0)[sample(sum(dta$tobacco==0), 2000, replace=FALSE),]) dim(dta2) names(dta2) Tr<-dta2$tobacco x<-dta2$dmeduc # So the mean matchign deals with ties randomly y<-dta2$dbirwt # Number of treatment observations sum(Tr) # Number of control observations length(Tr)-sum(Tr) rr.match1<-meanmatch(dtreat=Tr,x=x) summary(rr.match1) rr.match1$matched.control.sample # Selecting the N controls closest to the mean of treatments mean.tr<-mean(x[Tr==1]) N.tr<-sum(Tr==1) # Number of treatments dis=abs(mean.tr-x) sample.co<-order(dis)[Tr==0][1:N.tr] sample.co[1:5] rr.match1$matched.control.sample[1:5] # Compute the average treatment effect on treated att<-mean(y[Tr==1])-mean(y[rr.match1$matched.control.sample]) # Now compare this to a simple difference in means att2<-mean(y[Tr==1])-mean(y[Tr==0]) # More educated women smoke less ==> more educated mothers are more likely to have healthier children # Matching reduces this bias # Let's compare the balance with ttests # 1. Difference of means t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[Tr==0]) t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[rr.match1$matched.control.sample]) t.test(dta2$mblack[Tr==1],dta2$mblack[Tr==0]) t.test(dta2$mblack[Tr==1],dta2$mblack[rr.match1$matched.control.sample]) ## BUT WE ACTUALLY HAVE TO MAKE WEIGHTED T-TESTS==> THESE T-TESTS AFTER MATCHING ARE NOT VALID WITHOUT TAKING WEIGHTS INTO ACCOUNT ## WE WON'T WORRY ABOUT WEIGHTED T-TESTS FOR NOW, JUST COMPARE DIFFERENCE IN MEANS USING WEIGHTED MEANS # Still different means but it's better after matching # 2.QQ plots par(mfrow=c(1,2)) qqplot(dta2$dmeduc[Tr==1], dta2$dmeduc[Tr==0], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) qqplot(dta2$dmeduc[Tr==1], dta2$dmeduc[rr.match1$matched.control.sample], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) # And to OLS controlling for mother's education summary(lm(y~Tr+x)) # And to matching using Match() rr.match2<- Match(Y=y, Tr=Tr, X=x,estimand="ATT") summary(rr.match2) MatchBalance(Tr~x, match.out=rr.match2) # Too many ties==> binary variables ###################################### SKIP ############################################# # We can reduce the number of controls ==> Sample by treatment-control dta3<-do.call("rbind", lapply(split(dta, dta$tobacco), function(x) x[sample(nrow(x), 1000, replace=FALSE),])) # So now we have 1000 treatments and 1000 controls rr.match3<- Match(Y=dta3$dbirwt, Tr=dta3$tobacco, X=dta3$dmeduc,estimand="ATT") summary(rr.match3) MatchBalance(Tr~x, match.out=rr.match3) # This takes for ever, why? Becuase there are so many controls # Or let's take 1000 treatments and 2000 controls dta4<-rbind(subset(dta,dta$tobacco==1)[sample(sum(dta$tobacco==1), 1000, replace=FALSE),],subset(dta,dta$tobacco==0)[sample(sum(dta$tobacco==0), 2000, replace=FALSE),]) dim(dta4) # So now we have 1000 treatments and 2000 controls rr.match4<- Match(Y=dta4$dbirwt, Tr=dta4$tobacco, X=dta4$dmeduc,estimand="ATT") summary(rr.match4) MatchBalance(Tr~x, match.out=rr.match4) # Or we can add a random covariate x2<-rnorm(length(x)) X<-data.frame(x,x2) Match(Y=dta2$dbirwt, Tr=dta2$tobacco, X=X,estimand="ATT") ###################################### SKIP ############################################# # And Propensity Score Tr<-dta2$tobacco x<-dta2$dmeduc y<-dta2$dbirwt pscore<-glm(Tr~x,family=binomial(link=logit)) phat<-pscore$fitted.values etahat<-pscore$linear.predictors par(mfrow=c(1,2)) boxplot(phat~Tr,range=5) summary(lm(y~Tr+x)) summary(lm(y~Tr+phat)) rr.matchpscore<-meanmatch(dtreat=Tr,x=phat) t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[Tr==0]) t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[rr.matchpscore$matched.control.sample]) # Balancing the pscore balanced education!!!!! # But the idea of propensity score is to reduce dimensionality # So let's include a lot of covariates Tr<-dta2$tobacco y<-dta2$dbirwt pscore<-glm(Tr~dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete,family=binomial(link=logit)) phat2<-pscore$fitted.values etahat2<-pscore$linear.predictors par(mfrow=c(1,2)) boxplot(phat2~Tr,range=5) summary(lm(y~Tr+dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete)) summary(lm(y~Tr+phat2)) rr.matchpscore2<-meanmatch(dtreat=Tr,x=phat2) t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[Tr==0]) # t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[rr.matchpscore2$matched.control.sample]) t.test(dta2$mblack[Tr==1],dta2$mblack[Tr==0]) # t.test(dta2$mblack[Tr==1],dta2$mblack[rr.matchpscore2$matched.control.sample]) # Use Match() to match on the propensity score rr.match3<- Match(Y=y, Tr=Tr, X=phat2, M=1,estimand="ATT") MatchBalance(Tr~dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete, match.out=rr) par(mfrow=c(1,2)) qqplot(dta2$dmeduc[Tr==1], dta2$dmeduc[Tr==0], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) qqplot(dta2$dmeduc[rr.match3$index.treated], dta2$dmeduc[rr.match3$index.treated], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) # It works!!! t.test(dta2$dmeduc[Tr==1],dta2$dmeduc[Tr==0]) # t.test(dta2$dmeduc[rr.match3$index.treated],dta2$dmeduc[rr.match3$index.control]) t.test(dta2$mblack[Tr==1],dta2$mblack[Tr==0]) # t.test(dta2$mblack[rr.match3$index.treated],dta2$mblack[rr.match3$index.control]) # SEE HOW UNWEIGHTED MEAN IS DIFFERENT! mean(dta2$dmeduc[rr.match3$index.treated]) mean(dta2$dmeduc[Tr==1]) length(dta2$dmeduc[rr.match3$index.treated]) length(dta2$dmeduc[Tr==1])