############################################# example.R ################################################# # # daniel almirall (dalmiral@umich.edu), sunil mithas # posted july 05, 2006 at the university of michigan # ######################################################################################################### # # this is R code that accompanies the following journal article: # # Mithas, S., Almirall, D. and Krishnan, M.S. (2006). # "Do CRM Systems Cause One-to-One Marketing Effectiveness?" # Statistical Science. # # 1. please read the file sts2006_functions.R first # this file contains the functions used in this example. you can copy the # code from this file and paste into the Rgui to run it. you should do this line by line # if you are an R beginner. # # 2. this file provides an example using the functions in sts2006_functions.R # using hypothetical data. the data used below is not real data. # # 3. R's current directory must be the same directory # in which the hypothetical data set data.txt and the file sts2006_functions.R reside. # you can change R's current directory under File --> Change dir... # ######################################################################################################### ###################################### preliminary analyses ############################################# ## these are typical analyses that you will carry out prior to using the ## functions supplied in sts2006_functions.R. ## the preliminary analyses shown here are merely illustrative. hypothetical data ## is used in this example. in a real data analysis, you will want to be more careful ## and thoughtful about your choice of propensity score model and your choice of ns ## the number of subclassifications along the propensity score range. there are other ## considerations, as well. please consult the propensity score literature for more information. ## this file and these functions do not help you choose a propensity score model. ## load the functions into R source(file="sts2006_functions.R") ## begin sinking to an output file #sink(file="example_out.txt") cat("\nan example analysis using the functions in sts2006_functions.R\n") ## load the hypothetical data set # dat has 3 covariates x1, x2, x3 # x1 and x2 are continuous, x3 is binary # a binary variable z denoting the putative cause # z=1 is the treatment group # z=0 is the control group # and a binary variable y denoting the outcome dat <- read.table("data.txt",header=T) ## what is the size of the data set? cat("\n\nprint first ten and last ten data values: n =",nrow(dat),"\n") print(rbind(dat[1:10,],rep(".",5),dat[504:514,])) ## obtain a summary of the data cat("\n\nsummary of the data set\n") print(summary(dat)) ## run a logistic regression of z on x1, x2, x3 ## this is an example of a propensity score model glm1 <- glm(z ~ x1 + x2 + x3, family=binomial, data=dat) ## obtain a summary of the logistic regression or propensity score model cat("\n\nsummary of the propensity score model\n") print(summary(glm1)) ## obtain the propensity score from the logistic regression model shown above. ## this is the predicted probability Pr(z=1|x1,x2,x3) using the propensity score model pscore <- predict(glm1, type = "response") ## take a look at the distribution of the propensity score for the z=1 and z=0 groups seperately cat("\n\nplotting distribution of the propensity score for each z group...\n") par(mfrow=c(2,1)) hist(pscore[dat$z==1], main="propensity score for treated group") hist(pscore[dat$z==0], main="propensity score for control group") ## for illustrative purposes only, break the propensity score into ## subclasses. in a real data analysis, this would ## require much more thought, practice and iteration ## in particular, the strata should be chosen based on some measure of balance ## of the covariate across the two treatment groups z=1 and z=0. subclass <- cut(x=pscore,breaks=c(0,0.4,0.6,0.8,1.0),include.lowest = T) ## take a look at the frequency of firms in each subclass cat("\n\nfrequency of firms by z within each subclass along the propensity score\n") cat("more thoughtful work could be done here when choosing propensity score strata\n") dat$subclass <- subclass print(ftable( dat$subclass,dat$z )) dat$s2 <- as.numeric(subclass) ############################### using the functions in sts2006_functions.R ############## ## this is an illustration of how to use the functions supplied in the file ## sts2006_functions.R. ## estimate \delta using a naive estimator: the difference in proportions in the two groups ## this estimator does not account for confounding by x1, x2, x3 cat("\n\nusing the ace.mle() function: an unadjusted/naive estimator \n") cat("and standard error of \delta = Pr( r(1)=1 ) - Pr( r(0)=1 )\n") cat("this analysis suggests no significant difference\n\n") print( ace.mle1 <- ace.mle(n=nrow(dat),z=dat$z,r=dat$y,ns=4,s=rep(1,nrow(dat)) ) ) ## confidence interval for the unadjusted mean difference cat("\n95% confidence interval for the unadjusted mean difference\n") lb <- ace.mle1$ace - 1.96*ace.mle1$sd.ace ub <- ace.mle1$ace + 1.96*ace.mle1$sd.ace print(round(c(lb,ub),2)) ## estimate \delta using a propensity score stratification: the difference in proportions in the two groups ## this estimator accounts for confounding by x1, x2, x3 using the strata defined by the propensity score cat("\n\nusing the ace.mle() function: a propensity score stratified estimator \n") cat("and standard error of \delta = Pr( r(1)=1 ) - Pr( r(0)=1 )\n") cat("this analysis suggests a significant 14 percentage point causal effect\n\n") print( ace.mle2 <- ace.mle(n=nrow(dat),z=dat$z,r=dat$y,ns=4,s=dat$s2 ) ) ## confidence interval for the propensity score adjusted mean difference cat("\n95% confidence interval for the propensity score adjusted mean difference\n") cat("this ci may be too small since it does not take into \n") cat("account the error in estimating the propensity score\n") lb <- ace.mle2$ace - 1.96*ace.mle2$sd.ace ub <- ace.mle2$ace + 1.96*ace.mle2$sd.ace print(round(c(lb,ub),2)) ## a sample sensitivity analysis using the sens.ace.mle function ## one can easily consider more extreme values for the sa parameters cat("\n\nan example sensitivity analysis using the sens.ace.mle() function\n") cat("at various levels of alpha, delta0, delta1, and pi \n") cat("see the paper RR83 for more details\n") oddvals <- c(1/2,2) pivals <- c(0.1, 0.9) gathr <- NULL cnt <- 0 for (i in oddvals){ for (j in oddvals){ for (k in oddvals){ for (l in pivals){ cnt <- cnt + 1 sares <- sens.ace.mle(n=nrow(dat),z=dat$z,r=dat$y,ns=4,s=dat$s2, alpha = i, delta = c(j,k), pii = l) gathr <- rbind( gathr, c(c(i,j,k,l),sares) ) }}}} attr(gathr,"dimnames")[[2]][1:4] <- c("alpha","delta0","delta1","pi") print(round(gathr,4)) ## balance diagnositcs using R01's methodology before stratification cat("\n\nbalance diagnostic statistics before stratification\n\n") print(rubinbef <- rubinhsorm2001(pscore,dat,"z",c("x1","x2","x3"))) ## balance diagnositcs using R01's methodology within strata and after stratification ## loop over the four strata and report the diagnostic statistics cat("\nbalance diagnostic statistics within each strata\n") bresults <- vector("list",4) # balance results ess <- NULL for (i in 1:4){ cat("\n******************************** strata",i,"********** \n\n") restmp <- rubinhsorm2001(pscore[dat$s2==i],subset(dat,dat$s2==i),"z",c("x1","x2","x3")) print(restmp) bresults[[i]] <- restmp esstmp <- restmp$essentials ess <- rbind(ess,esstmp) rownames(ess) <- paste("strata",1:i) print(esstmp) } ## balance diagnostic statistics overall summary cat("\n\nbalance diagnostic statistics overall (before and after) final summary \n") wts <- table(dat$s2)/nrow(dat) overall <- apply(ess, MARGIN=2, FUN=function(x) { sum(x*wts) }) print(ess <- rbind(before=rubinbef$essentials,ess,after=overall) ) cat("\n\nend of file\n") ## stop sinking output to example_out.txt #sink() # # eof