########## PS-239 FALL 2006 ############################################# ########## SECTION 6 ############################################# library(MASS) library(foreign) library(Matching) ## Read data dta<-read.dta(file="W:/Teaching/Fall2006_PolSci239/Section_Notes/Section5/smoking2.dta") names(dta) dim(dta) # Check if there are NAs apply(dta,2,function(x) sum(is.na(x))) # Treatment variable: tobacco==> indicator equal to one if mother smokes # Outcome variables: # death==> indicator equal to one if infant died within 1 year of birth # dbirwr==> birth weight of the infant (in grams) # Covariates: dmage, dmeduc, mblack, etc. # Number of treatment observations sum(dta$tobacco) # Number of control observations nrow(dta)-sum(dta$tobacco) # Since the data has too many observations, I take a sample # set.seed(1234) # sample(nrow(dta),5000) # If 'x' has length 1, sampling takes place from '1:x'. # dta2<-dta[sample(nrow(dta),3000),] # Sample the rows of dta and then I index # write.csv(dta2, file = "W:/Teaching/Fall2006_PolSci239/ProblemSets/ProblemSet4/smoking_short.csv", append = FALSE,row.names=FALSE) dta2<-read.csv(file = "W:/Teaching/Fall2006_PolSci239/ProblemSets/ProblemSet4/smoking_short.csv",header=TRUE) dim(dta2) names(dta2) nn.match<-function(x,Tr) { # Define parameters index<-1:length(x) index.t<-index[Tr==1] index.c<-index[Tr==0] Nt=length(x[Tr==1]) xt<-x[Tr==1] xc<-x[Tr==0] sample.match<-NULL for (i in 1:Nt) { # Check if there are ties dis<-abs(xt[i]-xc) # Calculate the absolute value of the distance between xt[i] and every xc dta<-cbind(index.c,xc,dis) dta<-dta[order(dis),] nties=1 for (j in 1:(length(dis)-1)) { # Subtract one because I'm comparing j with j+1 and element after-last does not exist! if (dta[j,3]==dta[j+1,3]) nties=nties+1 else break } # Now nmatchs has the number of nearest matches that I found for x[i] w<-1/nties xc.add<-cbind(index.t[i],xt[i],dta,w)[1:nties,] sample.match<-rbind(sample.match,xc.add) } colnames(sample.match)=c("index.tr","xt","index.co","xc","dis","wts") list(sample.match=sample.match,index.tr=sample.match[,1],index.co=sample.match[,3],wts=sample.match[,6]) } # End function # Define treatment and a covariate y<-dta2$dbirwt Tr<-dta2$tobacco x<-dta2$dmeduc # Use the function to match on mother's education match1<-nn.match(x=x,Tr=Tr) summary(match1) dim(match1$sample.match) # Average Treatment effect with no matching mean(y[Tr==1])-mean(y[Tr==0]) # ATT att<-weighted.mean(y[match1$index.tr],w=match1$wts)-weighted.mean(y[match1$index.co],w=match1$wts) weighted.mean(y[match1$index.tr],w=match1$wts,na.rm=T)-weighted.mean(y[match1$index.co],w=match1$wts,na.rm=T) # The estimate goes down in absolute value ==> consistent wiht the bias # Wrong way to do it mean(y[match1$index.tr],na.rm=T)-mean(y[match1$index.co],na.rm=T) # Is education now balanced? weighted.mean(dta2$dmeduc[match1$index.tr],w=match1$wts,na.rm=T) # Mean for treated weighted.mean(dta2$dmeduc[match1$index.co],w=out$wts,na.rm=T) # Mean for control weighted.mean(dta2$dmeduc[match1$index.tr],w=match1$wts,na.rm=T)-weighted.mean(dta2$dmeduc[match1$index.co],w=out$wts,na.rm=T) # Compare with Match() function match2<- Match(Y=dta2$dbirwt,Tr=dta2$tobacco, X=dta2$dmeduc, M=1, estimand = "ATT") summary(match2) # VOILA! THE RESULTS ARE THE SAME!!!! # Actually, MatchBalance() also shows us that the balance on dmeduc is complete MatchBalance(Tr~dta2$dmeduc, match.out=match2,ks=FALSE) # The function nn.match completely reproduces the outcome of Match() for UNIVARIATE matching # So now we understand exactly what we are doing when we call Match() # Define covariates # covariates<-data.frame(dta2$dmage,dta2$dmeduc,dta2$dmar,dta2$dlivord,dta2$nprevist,dta2$disllb,dta2$dfage,dta2$dfeduc,dta2$anemia,dta2$diabete,dta2$phyper,dta2$pre4000,dta2$preterm,dta2$alcohol,dta2$drink,dta2$foreignb,dta2$plural,dta2$deadkids,dta2$mblack,dta2$motherr,dta2$mhispan,dta2$fblack,dta2$fotherr,dta2$fhispan,dta2$tripre1,dta2$tripre2,dta2$tripre3,dta2$tripre0,dta2$first) covariates<-data.frame(dta2$dmage,dta2$dmeduc,dta2$dmar,dta2$dlivord,dta2$nprevist,dta2$dfage,dta2$dfeduc,dta2$anemia) # Define treatment and outcome Tr<-dta2$tobacco y<-dta2$dbirwt # 1. Difference in means on these covariates # Diff-in-means before matching t.test(dta2$dmage[Tr==1],dta2$dmage[Tr==0]) # We can use apply to do all at the same time apply(covariates,2,function(x) t.test(x[Tr==1],x[Tr==0])) # Or apply(covariates[Tr==1,],2,mean) apply(covariates[Tr==0,],2,mean) # Variance ratios apply(covariates,2,function(x) var(x[Tr==1])/var(x[Tr==0])) # 2. Match on education match1<- Match(Tr=Tr, X=dta2$dmeduc,estimand="ATT",version="fast") summary(match1) MatchBalance(Tr~dta2$dmage+dta2$dmeduc+dta2$dmar+dta2$dlivord+dta2$nprevist+dta2$dfage+dta2$dfeduc+dta2$anemia,match.out=match1,ks=FALSE) # Means after matching t.test(dta2$dmage[match1$index.treated],dta2$dmage[match1$index.control]) # This is wrong because we need to reweight the data # These two means are equal!!!! And these are the correct treatment means mean(dta2$dmage[Tr==1]) wmean.treat<-weighted.mean(x=dta2$dmage[match1$index.treated],w=match1$weights) # But you don't get the right result if you do mean(dta2$dmage[match1$index.treated]) # The correct control mean after matching is wmean.cont<-weighted.mean(x=dta2$dmage[match1$index.control],w=match1$weights) # The weighted means are the means that are reported by the function MatchBalance # These two variances are different because we are not weighting the data var.ratio1<-var(dta2$dmage[Tr==1])/var(dta2$dmage[Tr==0]) var.ratio2<-var(dta2$dmage[match1$index.treated])/var(dta2$dmage[match1$index.control]) # Weighted variance formula # sum[ wi (xi-xbar.wt)^2]/[(n-1) wbar] # where wi is weigh of observation i, x.bar.wt is the weighted mean of x, and wbar is the mean of the weights wvar.treat<-sum(match1$weights*(dta2$dmage[match1$index.treated]-wmean.treat)^2)/length(match1$weights-1)*mean(match1$weights) wvar.cont<-sum(match1$weights*(dta2$dmage[match1$index.control]-wmean.cont)^2)/length(match1$weights-1)*mean(match1$weights) var.ratio2<-wvar.treat/wvar.cont # Therefore the correct variance ratios are var.ratio1<-var(x=dta2$dmage[Tr==1])/var(dta2$dmage[Tr==0]) wvar.treat<-weighted.var(x=dta2$dmage[match1$index.treated],w=match1$weights) wvar.cont<-weighted.var(x=dta2$dmage[match1$index.control],w=match1$weights) var.ratio2<-as.numeric(wvar.treat)/as.numeric(wvar.cont) # Function to calculate the weighted variance weighted.var<-function(x,w) { x.wmean<-weighted.mean(x=x,w=w) w.mean<-mean(w) n<-length(x) wvar<-sum(w*(x-x.wmean)^2)/(n-1)*w.mean list(wvar=wvar) } results<-matrix(ncol=3,nrow=ncol(covariates)+1) vars<-c() colnames(results)<-c("Mean treated","Mean control","Difference") rownames(results)<-names(covariates)[1:9] for (i in 1:ncol(covariates)) { results[i,1]<-weighted.mean(x=covariates[,i][match1$index.treated],w=match1$weights) results[i,2]<-weighted.mean(x=covariates[,i][match1$index.control],w=match1$weights) results[i,3]<-results[i,1]-results[i,2] } apply(covariates[match1$index.treated,],2,function(x) weighted.mean(x,match1$weights)) apply(covariates[match1$index.control,],2,function(x) weighted.mean(x,match1$weights)) par(mfrow=c(1,2)) qqplot(dta2$dmage[Tr==1], dta2$dmage[Tr==0], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) qqplot(dta2$dmage[match1$index.treated], dta2$dmage[match1$index.control], ylim=c(0,15), xlim=c(0,15)) abline(0, 1, col=2) # 3. Propensity score pscore1<-glm(Tr~dta2$dmeduc,family=binomial(link=logit)) phat1<-pscore1$fitted.values etahat1<-pscore1$linear.predictors pscore2<-glm(Tr~dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete,family=binomial(link=logit)) phat2<-pscore2$fitted.values etahat2<-pscore2$linear.predictors par(mfrow=c(1,2)) boxplot(phat1~Tr,range=5) boxplot(phat2~Tr,range=5) # 4. Match on the longest propensity score match2<- Match(Tr=Tr, X=phat2, M=1) MatchBalance(Tr~dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete, match.out=match2,ks=FALSE) # 5. Estimate ATT match2<- Match(Y=y,Tr=Tr, X=phat2, M=1) summary(match2) ols<-summary(lm(y~Tr+dta2$dmeduc+dta2$dmage+dta2$mblack+dta2$dmar+dta2$dfage+dta2$drink+dta2$tripre1+dta2$tripre2+dta2$deadkids+dta2$phyper+dta2$diabete))