###################################################################### ################ PS-239 - Section 8 - Bootstrapping ################## ###################################################################### library(MASS) library(foreign) #First, let's understand what sample() does help(sample) # sample() takes a sample of the specified size from the elements of x using either with or without replacement. # By default, when we do sample(x), 'size' is equal to 'length(x)' so that 'sample(x)' # generates a random permutation of the elements of 'x' (or '1:x') # So if x=[2,5,8,9,21], length(x)=5, sample(x) takes a random sample of size 5 from the elements of x x<-c(2,5,8,9,21) x sample(x) sample(x, replace=TRUE) # When we sample with replacement, not all original observations are in every sample # Now to bootstrapping ###################################################################### # Example 1 ==> We want to know the the median of some data ######################################################################### # Natural estimator: sample median # How variable is this estimator? # Large sample theory ==> median is asymptotically normal with mean=m and var=1/4f(m)^2 # But in order to calculate the var(m) we need to know the density at the median, i.e. f(m) # f(m) is unknown, so what do we do? Bootstrap! # Let's use the galaxies data data(galaxies) galaxies<-galaxies/1000 mean(galaxies) median(galaxies) # How variable is this estimator of the median? help(density) # y gives us the estimated density values #density(x)$y # Calculate the variance of sample median analytically (well, using the estimated density) # Estimated density at median(galaxies) est.den<-density(galaxies,n=1,from=20.833,to=20.834)$y est.den # Analytical variance and standard error of sample median an.var<-1/(4*length(galaxies)*est.den^2) an.se<-sqrt(1/(4*length(galaxies)*est.den^2)) # But now we can bootstrap it set.seed(101) # Always set your seed to be able to replicate in the future nboot<-1000 # Number of bootstrap samples that we'll take results<-numeric(nboot) # Create an empty vector: "numeric(length=l)" creates a real vector of length l # The elements of the #vector are all equal to '0' # Now let's bootstrap for(i in 1:nboot){ bootsample<-sample(galaxies,size=length(galaxies),replace=TRUE) results[i]<-median(bootsample) } length(results) # This has all our estimated medians # Compare median(galaxies) # Full sample estimator of the median mean(results) # Bootstrap estimator of the median # Estimate the bias # results-median(galaxies) # Vector that contains by how much each bootstrap estimate differs from full sample estimate bias<-mean(results-median(galaxies)) # This is the bias of the estimator of the median, i.e. of the sample median bias # It's a small bias compared to the units of these data # Estimate the standard error of the sample median se<-sqrt(var(results)) # Compare to the analytical result se an.se # Pretty close truehist(results,h=0.01) # Does not look very normal # How would we get the confidence intervals for the sample median? quantile(results,p=c(0.025,0.975)) median(galaxies) se bias median(galaxies)/sqrt(var(results)) library(boot) set.seed(101) #boot(galaxies,function(x,i) median(x[i]),R=1000) ###################################################################### # Example 2 ==> Bootstrap the mean ######################################################################### set.seed(101) # Always set your seed to be able to replicate in the future nboot<-1000 # Set the number of repetitions results<-numeric(length=nboot) for (i in 1:nboot) { bootsample<-sample(galaxies,replace=TRUE) results[i]<-mean(bootsample) } mean(galaxies) # Full sample estimator mean(results) # Bootstrap estimator # They are similar, as they should be sqrt(var(results)) # Bootstrapped standard error sqrt(var(galaxies)/length(galaxies))# Standard error of sample mean using formula # They are similar, as they should be ###################################################################### # Example 3 ==>NON-PARAMETRIC BOOTSTRAP OF OLS ######################################################################### library(foreign) # Loads the read.*** and write.*** commands ################ FUNCTIONS ################################################### # OLS function ols<-function(y,x) { n=nrow(x) k=ncol(x) beta.hats<-ginv(t(x)%*%x)%*%t(x)%*%y residu<-y-x%*%beta.hats sigma2<-(t(residu)%*%residu)/(n-k) varcovar<-sigma2[1,1]*(solve(t(x)%*%x)) std.dev<-sqrt(diag(varcovar)) list(betas=beta.hats,std.dev=std.dev) } # Bootstrapping of OLS boots<-function(dataframe,num.rep) { set.seed(103) data.as.matrix<-data.matrix(dataframe) betas.star<-matrix(nrow=ncol(data.as.matrix),ncol=num.rep) # I don't substract one because I need an extra column for the constant sd.star<-matrix(nrow=ncol(data.as.matrix),ncol=num.rep) for (i in 1:num.rep) { sample.star<-data.as.matrix[sample(1:nrow(data.as.matrix),replace=T),] y.star<-sample.star[,1] x.star<-cbind(1,sample.star[,2:ncol(data.as.matrix)]) betas.star[,i]<-ols(y.star,x.star)$betas sd.star[,i]<-ols(y.star,x.star)$std.dev list(betas.star=betas.star,sd.star=sd.star) } } ################################################################### dta<-read.dta(file="W:/Teaching/Fall2006_PolSci239/ProblemSets/ProblemSet1/ME_households.dta") attach(dta) # Replace NA values by the median in the variables I will use later apply(dta,2,function(x) sum(is.na(x))) educ.hh [is.na(educ.hh )]<-floor(median(educ.hh ,na.rm=T)) educ.sp [is.na(educ.sp)]<-floor(median(educ.sp,na.rm=T)) summary(dta) names(dta) # OLS estimation # 1) With lm lmout<-summary(lm(sum.ds~tra.con+hhsize+educ.hh+educ.sp)) lmout$coefficients ## 2) With a hand-made function # This function needs matrix inputs ols.mat<-as.matrix(cbind(sum.ds,tra.con,hhsize,educ.hh,educ.sp)) dim(ols.mat) ols.mat<-ols.mat[!is.na(sum.ds),] dim(ols.mat) y<-ols.mat[,1] x<-cbind(1,ols.mat[,2:ncol(ols.mat)]) # Remember to include the constant data<-cbind(y,x) output.ols<-ols(y,x) cbind(output.ols$betas,output.ols$std.dev) lmout$coefficients # Let's do some bootstrapping set.seed(101) nboot<-1000 bboot<-matrix(nrow=nboot,ncol=ncol(data)-1) dim(bboot) for(i in 1:nboot){ bootsample<-data[sample(1:nrow(data),replace=TRUE),] # Note that now I sample the indices yboot<-bootsample[,1] xboot<-bootsample[,2:ncol(data)] bboot[i,]<-t(ols(yboot,xboot)$betas) } apply(bboot,2,mean) # Bootstrap estimators t(output.ols$betas) # Full sample estimators # They are very similar, as they should be # Bootstrapped standard errors apply(bboot,2,function(x) sqrt(var(x))) # Analytical standard errors output.ols$std.dev # Very nice: they are really similar #Bootstrapped confidence intervals help(sort) apply(bboot,2,function(x) quantile(x,p=c(0.025,0.975))) # Compare these CI with the estimated betas t(output.ols$betas) # Full sample estimators # Alternatively alpha<-0.05 # Significance level q025<-ceiling(nboot*(alpha/2)) # 0.025 quantile q975<-ceiling(nboot*(1-alpha/2)) # 0.975 quantile q025 q975 apply(bboot,2,function(x)sort(x))[c(q025,q975),] # Compare with apply(bboot,2,function(x) quantile(x,p=c(0.025,0.975))) # The differences are due to the approximations used by the quantile()function # Finally, let's look at the bias diff<-matrix(nrow=nboot,ncol=5) for(i in 1:5){ diff[,i]<-bboot[,i]-output.ols$betas[i] } biasols<-apply(diff,2,mean) biasols # Very small bias! ################################################################################ ## SKIP THIS FOR NOW: DIFFERENT BOOTSTRAP METHODS ################################################################################ alpha=0.05 num.rep=1000 output.boots<-boots(dataframe=data.frame(shfirmfloor,treatpf.all,hhsize.room,educ.hh,educ.sp),num.rep=num.rep) # Method 1: Percentile t-method #t.star1<-(output.boots$betas.star-matrix(data=t(output.ols$betas),nrow=num.rep, ncol=nrow(output.ols$betas),byrow=T))/output.boots$sd.star # Ojo, las restas que no conforman solo funcionan por vector columna ==> Si tiene la misma dimensión de filas, resta columna a columna t.star<-(output.boots$betas.star-output.ols$betas[,1])/output.boots$sd.star quants.t.star<-apply(t.star, 1, sort) [c(ceiling((1-alpha/2)*num.rep),ceiling((alpha/2)*num.rep)),] ci.t.star<-output.ols$betas[,1]-t(quants.t.star)*output.ols$std.dev # Method 2: Percentile method (sin dividir por std) diff.star<-(output.boots$betas.star-output.ols$betas[,1]) quants.diff.star<-apply(diff.star, 1, sort) [c(ceiling((1-alpha/2)*num.rep),ceiling((alpha/2)*num.rep)),] ci.diff.star<-output.ols$betas[,1]-t(quants.diff.star) # Method 3: Efron's percentile method quants.beta.star<-apply(output.boots$betas.star, 1, sort) [c(ceiling((alpha/2)*num.rep),ceiling((1-alpha/2)*num.rep)),] ci.beta.boots<-t(quants.beta.star) apply(t(output.boots$betas.star), 2,quantile) ################################################################################ ## PARAMETRIC BOOTSTRAPPING ################################################################################ lmout2<-lm(sum.ds~tra.con+hhsize+educ.hh+educ.sp) betas.fs<-cbind(as.vector(lmout2$coefficients)) length(lmout2$residuals) bboot.par<-matrix(nrow=nboot,ncol=ncol(data)-1) for(i in 1:nboot){ bootresid<-sample(lmout$residuals,replace=TRUE) yboot<-x%*%betas.fs+bootresid bboot.par[i,]<-t(ols(yboot,x)$betas) } apply(bboot,2,mean) # Non-parametric Bootstrap estimators apply(bboot.par,2,mean) # Parametric Bootstrap estimators t(output.ols$betas) # Full sample estimators # They are very similar, as they should be # Parametric Bootstrapped standard errors apply(bboot.par,2,function(x) sqrt(var(x))) # Analytical standard errors output.ols$std.dev # Very nice: they are really similar #Parametrically Bootstrapped confidence intervals help(sort) apply(bboot.par,2,function(x) quantile(x,p=c(0.025,0.975))) # Compare these CI with the estimated betas t(output.ols$betas) # Full sample estimators # Alternatively alpha<-0.05 # Significance level q025<-ceiling(nboot*(alpha/2)) # 0.025 quantile q975<-ceiling(nboot*(1-alpha/2)) # 0.975 quantile q025 q975 apply(bboot.par,2,function(x)sort(x))[c(q025,q975),] # Compare with apply(bboot.par,2,function(x) quantile(x,p=c(0.025,0.975))) # The differences are due to the approximations used by the quantile()function # Finally, let's look at the bias diff<-matrix(nrow=nboot,ncol=5) for(i in 1:5){ diff[,i]<-bboot.par[,i]-output.ols$betas[i] } biasols.par<-apply(diff,2,mean) biasols.par # Very small bias! ################################################################################ ## MONTECARLO ################################################################################ # Example of Monte Carlo simulations # Example taken from Tom Huber, Physics Department, Gustavus Adolphus College. (http://physics.gac.edu/~huber/envision/instruct/montecar.htm) # Square that has one corner at the origin of a coordinate system and has sides of length 1 ==>area=1 # Inscribe a quarter of a circle with radius inside this square ==> area=pi/4. # We can do a Monte Carlo simulation to find the relative area of the circle and square and then multiply the circle's area by 4 to find pi. # For a point (X,Y) to be inside of a circle of radius 1, its distance from the origin (X2+Y2) will be less than or equal to 1. # We can generate thousands of random (X,Y) positions and determine whether each of them are inside of the circle. # Each time it is inside of the circle, we will add one to a counter. # After generating a large number of points, the ratio of the number of points inside the circle to the total number of points # generated will approach the ratio of the area of the circle to the area of the square. # Therefore pi is approximately equal to 4*(Number of Points inside circle)/ (Total Points Generated) nsimul<-100 cum<-0 for (i in 1:nsimul) { x<-runif(1) y<-runif(1) distan<-sqrt(x^2+y^2) if (distan<=1) cum<-cum+1 } 4*cum/nsimul # Using matrices nsimul<-100000 cum<-rep(FALSE,nsimul) x<-runif(nsimul) y<-runif(nsimul) distan<-sqrt(x^2+y^2) cum[(distan<=1)]<-TRUE 4*sum(as.numeric(cum))/nsimul