library(MASS) library(foreign) # Loads the read.*** and write.*** commands ################ BOOTSTRAPPING ################################################### # Function for Bootstrapping OLS (using lm()) boot.ols<-function(dataframe,n.boot) { set.seed(103) #dta<-data.matrix(dataframe) betas.boot<-matrix(nrow=ncol(dta),ncol=n.boot) # I don't substract one because I need an extra column for the constant sd.boot<-matrix(nrow=ncol(dta),ncol=n.boot) for (i in 1:n.boot) { sample.boot<-dta[sample(1:nrow(dta),replace=T),] y.boot<-sample.boot[,1] x.boot<-as.matrix(sample.boot[,2:ncol(dta)]) lmout<-summary(lm(y.boot~x.boot)) betas.boot[,i]<-lmout$coefficients[,1] sd.boot[,i]<-lmout$coefficients[,2] list(betas.boot=betas.boot,sd.boot=sd.boot) } } # DIFFERENT METHODS TO BOOTSTRAP CONFIDENCE INTERVALS ## Generate Data set.seed(101) x1<-rnorm(500,mean=3,sd=1) x2<-rnorm(500,mean=1,sd=1) x3<-rnorm(500,mean=2,sd=1) u<-rnorm(500,mean=0,sd=1) y<-20+2*x1+4*x2+15*x3+u dta<-data.frame(y,x1,x2,x3) lmout<-summary(lm(y~x1+x2+x3)) betas<-lmout$coefficients[,1] std<-lmout$coefficients[,2] # Bootstrap the t-statistic # For every bootstrap sample, I calculate betas and standard errors alpha=0.05 n.boot=1000 out.boot<-boot.ols(dataframe=dta,n.boot=n.boot) out.boot$betas.boot # bootstrapped betas t(out.boot$betas.boot) # transposed bootstrapped betas out.boot$sd.boot # bootstrapped standard errors # Method 1: Calculate confidence intervals using the Percentile t-method # Davisdson &Hinkley, page 29==> studentized bootstrap statistic # Si tiene la misma dimensión de filas, resta columna a columna # Calculate the bootstrapped t-statistics t.boot<-(out.boot$betas.boot-betas)/out.boot$sd.boot quants.t.boot<-apply(t.boot, 1, sort) [c(ceiling((1-alpha/2)*n.boot),ceiling((alpha/2)*n.boot)),] ci.t.boot<-betas-t(quants.t.boot)*std # Method 2: Calculate confidence intervals using the Percentile method (sin dividir por std) diff.boot<-(output.boots$betas.boot-output.ols$betas[,1]) quants.diff.boot<-apply(diff.boot, 1, sort) [c(ceiling((1-alpha/2)*n.boot),ceiling((alpha/2)*n.boot)),] ci.diff.boot<-output.ols$betas[,1]-t(quants.diff.boot) # Method 3: Calculate confidence intervals using the Efron's percentile method quants.beta.boot<-apply(out.boot$betas.boot, 1, sort) [c(ceiling((alpha/2)*n.boot),ceiling((1-alpha/2)*n.boot)),] ci.beta.boots<-t(quants.beta.boot) apply(t(output.boots$betas.boot), 2,quantile) # BOOTSTRAPPING HYPOTHESIS TESTS # Example 0: mean x<-rnorm(1000,mean=1,sd=1) nboot<-1000 # Null mu=0.9 xnull<-x-mean(x)+0.9 tstat<-mean(x) tboot0<-numeric(nboot) set.seed(101) for(i in 1:nboot) { sboot<-sample(xnull,replace=T) tboot0[i]<-mean(sboot) } pboot0<-length(which(tboot0>=tstat))/nboot # Random variable with mean 0.9 has mean greater than tstat 0.6% of times # Example 1: difference in means z0<-c(82,79,81,79,77,79,79,78,79,82,76,73,64) z1<-c(84,86,85,82,77,76,77,80,83,81,78,78,78) # Null hypothesis mu0=mu1 # Test statistic ==> t=zbar0-zbar1 # Under null hypothesis, two distributions are the same==> sample jointly Z<-c(z0,z1) D<-c(rep(0,length(z0)),rep(1,length(z1))) dta<-data.frame(D,Z) tstat<-mean(z1)-mean(z0) nboot<-999 tboot<-numeric(nboot) set.seed(101) for(i in 1:nboot) { sboot<-dta[sample(1:nrow(dta),replace=T),] tboot[i]<-mean(sboot[,2][sboot[,1]==1])-mean(sboot[,2][sboot[,1]==0])-tstat+0 } # Alternative: mu1>mu0 pvalue<-(sum(tboot>tstat)+1)/(nboot+1) pboot<-length(which(tboot>=tstat))/nboot # which() retunrns (1:length(x))[x], i.e. the indices of hte elements that are true in x # Equivalently pboot<-mean(tboot>=tstat) # We must somehow sample from a population for which the null hypothesis is true # Now let's use a pivotal statistic # Observed value of test statistic under null z<-(mean(z1)-mean(z0))/sqrt(var(z1)*1/length(z1)+var(z0)*1/length(z0)) tboot2<-numeric(length(nboot)) set.seed(101) for(i in 1:nboot) { sboot<-dta[sample(1:nrow(dta),replace=T),] z1boot<-sboot[,2][sboot[,1]==1] z0boot<-sboot[,2][sboot[,1]==0] num<-mean(z1boot)-mean(z0boot)-(mean(z1)-mean(z0)) den<-sqrt(var(z1boot)*1/length(z1boot)+var(z0boot)*1/length(z0boot)) tboot2[i]<-(num/den) } pboot2<-length(which(tboot2>=z))/nboot # Example 2:OLS # Null hypothesis: beta1=0 # Test whether beta1 is equal to zero # Alternative hypohtesis: beta1 is positive # We can construct a confidence interval lmout$coefficients[2,1] # beta1 n.boot=999 set.seed(101) results<-numeric(n.boot) for (i in 1:n.boot) { sample.boot<-dta[sample(1:nrow(dta),replace=T),] y.boot<-sample.boot[,1] x.boot<-as.matrix(sample.boot[,2:ncol(dta)]) temp<-summary(lm(y.boot~x.boot)) results[i]<-temp$coefficients[2,1] } alpha=0.05 sort(results)[c(ceiling((alpha/2)*(n.boot+1)),ceiling((1-alpha/2)*(n.boot+1)))] tstat<-lmout$coefficients[2,1] # beta1 # Zero is not inside the interval, so we conclude that beta1 is # significantly different from zero at the 5% level # Alternatively, we can compute the p-value directly using # 1. A resampling from a sample generated by beta1=0 (only feasible becasue we know the DGP) # 2. Studentized bootstrapping ## Generate Data # dta<-data.frame(y,x1,x2,x3) lmout<-summary(lm(y~x1+x2+x3)) betas<-lmout$coefficients[,1] std<-lmout$coefficients[,2] # Bootstrap OLS # y<-20+2*x1+4*x2+15*x3+u alpha=0.05 n.boot=1000 out.boot<-boot.ols(dataframe=dta,n.boot=n.boot) out.boot$betas.boot # bootstrapped betas # t(out.boot$betas.boot) # transposed bootstrapped betas # out.boot$sd.boot # bootstrapped standard errors # Null hypothesis beta1=1.9 t.boot<-(out.boot$betas.boot-betas)/out.boot$sd.boot # Studentized statistic that<-(lmout$coefficients[2,1]-1.9)/lmout$coefficients[2,2] pbootols<-length(which(t.boot[2,]>that))/nboot # Can't reject ######## # Note : Remember how to calculate left, right, adn two-sided pvalues ltpboot<-length(which(resultsH0<=tstat))/nboot utpboot<-length(which(resultsH0>tstat))/nboot # Two tailed p-value pboot<-2*min(ltpboot,utpboot) ######### ################################################################### # Parametric vs Non-Parametric Bootstrap in the presence of outliers ################################################################### set.seed(101) X<-rnorm(1000,mean=0,sd=1) Y<-rnorm(1000,mean=0,sd=1) X<-c(X,40) Y<-c(Y,1000) length(Y) length(X) lmall<-summary(lm(Y~X)) dta<-data.frame(Y,X) lmsub<-summary(lm(Y[1:1000]~X[1:1000])) nonpar<-boot.ols(dta,1000) plot(density(nonpar$betas.boot[2,])) # Beta plot(density(nonpar$betas.boot[1,])) # THe constant # Parametric bootstrap betas.fs<-cbind(as.vector(lmall$coefficients[,1])) length(lmall$residuals) n.boot=1000 Xc<-cbind(1,X) bboot.par<-matrix(nrow=n.boot,ncol=ncol(dta)) for(i in 1:n.boot){ bootresid<-sample(lmall$residuals) yboot<-Xc%*%betas.fs+bootresid bboot.par[i,]<-lm(yboot~X)$coefficients } plot(density(bboot.par[,2])) # Beta # What happens if we reduce the leverage? X<-c(X[1:1000],1) lmall<-summary(lm(Y~X)) dta<-data.frame(Y,X) lmsub<-summary(lm(Y[1:1000]~X[1:1000])) nonpar<-boot.ols(dta,1000) plot(density(nonpar$betas.boot[2,])) # Beta plot(density(nonpar$betas.boot[1,])) # THe constant # Parametric bootstrap betas.fs<-cbind(as.vector(lmall$coefficients[,1])) length(lmall$residuals) n.boot=1000 Xc<-cbind(1,X) bboot.par<-matrix(nrow=n.boot,ncol=ncol(dta)) for(i in 1:n.boot){ bootresid<-sample(lmall$residuals) yboot<-Xc%*%betas.fs+bootresid bboot.par[i,]<-lm(yboot~X)$coefficients } plot(density(bboot.par[,2])) # Beta