*RDiT Simulations
*Hausman and Rapson, ARRE, "Regression Discontinuity in Time: Considerations for Empirical Applications"

//cd here
global imagespath "../../Paper\imagefiles\"
//findit vincenty
clear all
version 13
version 13: set seed 100313 //todays date 100313
	//random things: monitor selection, treatment start dates
	//re-setting would lead to costly data work, because this selects which states we use for hourly data

*-----------------------------------*
*----------Random Selections--------*
*-----------------------------------*
****OZONE DATA
*We use ozone data from Auffhammer and Kellogg 2009 
	* https://www.aeaweb.org/articles?id=10.1257/aer.101.6.2687
*We drop California stations, since we know there are confounders there
*Following AK2009, we only keep monitors with a certain percentage of valid observations
*Also, we only keep monitors that are open for the whole sample
use AER20090377_FinalData, clear
drop if state_code==6 //california
drop if valid<9 //following AK. At least 9 valid observations for the period between 9 am and 9 pm each day.
keep state_code county_code site_id ozone_max day month year Date TempMax TempMin Rain Snow
tostring state county site, replace
	replace state="0"+state if length(state)==1
	replace county="00"+county if length(county)==1
	replace county="0"+county if length(county)==2
	replace site_id="000"+site_id if length(site_id)==1
	replace site_id="00"+site_id if length(site_id)==2
	replace site_id="0"+site_id if length(site_id)==3
gen monitorstring=state+county+site
rename Date date
gen pollutant="ozone"
rename ozone_max ozone
*candidate monitors: open for 18 years and have no two-month gaps (count: 110)
	sort monitorstring date
	gen temp=date-date[_n-1] if monitorstring==monitorstring[_n-1]
	bysort monitorstring: egen temp2=max(temp)	
	bysort monitorstring: egen mindate=min(date)
	bysort monitorstring: egen maxdate=max(date)
	gen length=maxdate-mindate
	codebook monitorstring if temp2<60 & length>16*365
	keep if temp2<60 & length>16*365
	drop temp*
*random selection of monitors
*this is not currently needed, because we use all monitors
	sort monitorstring date
	version 13: gen temp1=runiform()
	gen temp2=1 if monitorstring!=monitorstring[_n-1]
	gen temp3=temp1 if temp2==1
	egen temp4=rank(temp3)
	bysort monitorstring: egen monitor=max(temp4)
	drop temp1-temp4
compress
save counterfactual_data_ak2009, replace
version 13: display c(seed)
*X2479360ac069ac80b783ea1dd2d3020300044aec

*****RANDOM START DATES
*10 random treatment start dates, with at least 4 years pre and 4 years post
	/*if not running immediately following previous paragraph, do:
	set seed X2479360ac069ac80b783ea1dd2d3020300044aec
	*/
use counterfactual_data_ak2009, clear
keep date
duplicates drop
version 13: gen temp=runiform()
drop if date<10593+4*365 //min start date, with four years post
drop if date>17166-4*365 //max start date, with four years pre
sort temp
keep if [_n]<=10 //10 random date
keep date
sort date
save start_dates, replace
*12274 12614 12717 12804 13061 13344 13712 14471 14910 15678
version 13: display c(seed)
*X09b8d3186fd92d522a2483c627612e4300040a48

*****NOISE DATA
*adding in noise
*currently using this only for AR(1) simulations.
*For main simulations, we actually need 1000 samples of 1000 monitors each. This is below.
	/*if not running immediately following previous paragraph, do:
	set seed X09b8d3186fd92d522a2483c627612e4300040a48
	*/
use counterfactual_data_ak2009, clear
keep date day month year pollutant
duplicates drop
expand 1000
bysort date: gen monitor=[_n]
version 13: gen ozone=rnormal()
save counterfactual_data_noise, replace
version 13: display c(seed)
*Xc72478be3ee7e07259212ca18b83cf9f00043868

*****RANDOMLY GENERATED AR(1) DATA
*Counterfactual AR(1) data
*one sample of 1000 monitors
use counterfactual_data_noise, clear
	/*if not running immediately following previous paragraph, do:
	set seed Xc72478be3ee7e07259212ca18b83cf9f00043868
	*/
///earlier dates, for burn-in period
	drop month year day ozone pollutant
	expand 2, gen(expand)
	//sum date
	//display r(max)-r(min)+1
	replace date=date-6574 if expand==1
	drop expand
	//burn in didn't look long enough. try again. i now think it was long enough.
	*[update: i know think not necessary. could take out]
	expand 2, gen(expand)
	sum date
	display r(max)-r(min)+1
	replace date=date-13148 if expand==1
	drop expand
xtset monitor date, daily
	gen temp=1
	count if l.temp==.
	count if l.temp==. & date!=-9129
	drop temp //verifying no missing dates
version 13: gen resid=rnormal()
gen post=0
	replace post=1 if date>=13344
//sum date // earliest: -9129	
local beta_true -0.2
foreach auto of numlist 0/9{
	gen counterfactual_`auto'=0 if date==-9129
	replace counterfactual_`auto'=(`auto'/10)*l.counterfactual_`auto'+`beta_true'*post+resid if date>-9129
}
keep if date>=10593
drop resid
compress
save counterfactual_data_autoregress, replace
version 13: display c(seed)
*X55c34156a345ba66e3d084b16d17cc3e000421fd


*---------------------------------------*
*-------Air Quality Summary Stats-------*
*---------------------------------------*
**TABLE: AIR QUALITY SUMMARY STATS, AK DATA
use counterfactual_data_ak2009, clear
	keep if monitor!=89 & monitor!=95 //some of the monitors have too much missing data in the simulations below
	gen TempMean=(TempMin+TempMax)/2
	format %8.3f ozone
	format %8.2f Rain Snow
	format %8.0f TempMean TempMin TempMax  
	//Rain is in 0.01 inches
	//Snow is in inches
	replace Rain=Rain/100 //rain is in inches
	sum ozone TempMin TempMax TempMean Rain Snow, sep(0) format
	codebook date monitor 
	count if ozone==0
		//153 obs, out of 682130

**FIGURE: COUNTERFACTUAL AND TREATMENT EFFECT AT REPRESENTATIVE MONITOR, AK DATA
use counterfactual_data_ak2009, clear
tab monitors if monitor==2
local beta_true = -0.2
local start_date = 13344
gen lnozone=ln(ozone)
*generate treatment and outcome
	gen treatment=0
		replace treatment=1 if date>=`start_date'
	gen outcome=lnozone+`beta_true'*treatment
	replace outcome=. if treatment==0
rename lnozone x
rename outcome y
label variable x "True"
label variable y "With simulated treatment"
scatter x y date if monitor==2 & abs(date-`start_date')<4*365, ///
	msize(small small) mcolor(gs8 black ) msymbol(O Oh ) ///
	xline(`start_date', lcolor(gs8)) ///
	xtitle("") ytitle("Log ambient ozone") ///
	graphregion(color(white))
graph export $imagespath\trueseries`start_date'.eps, as(eps) replace
graph export $imagespath\trueseries`start_date'.png, as(png) width(1800) replace
tab state_c if monitor==2
tab county_c if monitor==2


*-----------------------------------*
*------------Simulations------------*
*---------Defining Programs---------*
*-----------------------------------*
program drop _all

program define RDIT_SIM
if "$data"=="airquality"{
	file open simulate using "results_airquality_$section.csv", write text replace
}
if "$data"=="noise"{
	file open simulate using "results_noise_$section.csv", write text replace
}
file write simulate "data,section,monitor,pollutant,"
file write simulate "beta_true,start_date,"
file write simulate "length,anticipation,expdecayrate,genlogdecay_b_M,phaseinlength,"
file write simulate "estimator,version,bic,r2,n,"
file write simulate "beta_hat,standard_error," 
file write simulate "mintreat,maxtreat" _n

foreach length of numlist $length_list {
foreach anticipation in $anticipation_list{
foreach beta_true in $beta_true_list {
foreach start_date in $start_date_list{
foreach expdecayrate in $expdecayrate_list{
forval b = $genlog_blist{
forval M = $genlog_Mlist{
foreach phaseinlength in $phasein_list{
foreach pollutant in $pollutant_list {

if "$data"=="airquality"{
	use counterfactual_data_ak2009, clear
}	
if "$data"=="noise"{	
	use counterfactual_data_noise, clear //mean zero, standard deviation 1
}	
xtset monitor date, daily	
	
if "$section"=="genlogdecay"{
	local A 0
	local K 1
	local B=-`b'/10000
	local v 0.5
	local Q 0.01
}

gen date_c=(date-`start_date')/1000 //centered date variable, rescaled so high order terms don't blow up in magnitude
gen ln`pollutant'=ln(`pollutant')
if "$data"=="noise"{
	replace ln`pollutant'=`pollutant' //don't want to log, because then non-normal and <0 missing
	//i.e., assume the noise data as already log-normally distributed.
} 

*generate treatment and outcome
	gen treatment=0
		replace treatment=1 if date>=`start_date' & date<`start_date'+`length'
		replace treatment=0.5 if date>=`start_date'-`anticipation' & date<`start_date'
		
		if "$section"=="exponentialdecay"{
			replace treatment=1*(1-`expdecayrate')^(date-`start_date') if date>=`start_date'
		}
		
		if "$section"=="genlogdecay"{
			replace treatment=0
			replace treatment=`A'+(`K'-`A')/(1+`Q'*exp(-`B'*(date-`start_date'-`M')))^(1/`v') if date>=`start_date'
		}

		if "$section"=="phasein"{
			replace treatment=0
			replace treatment=(date-`start_date'+1)/`phaseinlength' if date>=`start_date' & date<`start_date'+`phaseinlength'
			replace treatment=1 if date>=`start_date'+`phaseinlength'		
		}
		
		sum treatment if date>=`start_date'
		local maxtreat=r(max)
		sum treatment if date>=`start_date'
		local mintreat=r(min)
		
	gen outcome=ln`pollutant'+`beta_true'*treatment
	
* Now put ourselves in the empiricist's shoes.
*variables for regression, assuming constant treatment effect at true start date
	foreach y of numlist 1/9{
		gen date_c`y'=date_c^`y' //centered date variables for global polynomial
	}
	gen post=0
		replace post=1 if date>=`start_date' //empiricist assumes known start date and constant treatment thereafter
	foreach y of numlist 1/9{
		gen pre_date_c`y'=date_c^`y'*(1-post) //separate pre- and post- polynomials
	}
	foreach y of numlist 1/9{	
		gen post_date_c`y'=date_c^`y'*(post) //run as two separate loops so ordering is convenient
	}
	quietly tab month, gen(MM)
	gen dow=dow(date)
	quietly tab dow, gen(DOW)
	if "$data"=="airquality"{
	foreach v in TempMax TempMin Rain Snow{
		quietly sum `v'
		local temp=r(mean)
		gen `v'2=(`v'-`temp')^2
		gen `v'3=(`v'-`temp')^3
	}
	}
	gen ym=ym(year,month)
if "$data"=="airquality"{
	local controls "MM* DOW* TempMax* TempMin* Rain* Snow*"
}

* augmented local linear
foreach bandwidth of numlist 30 {
	gen studyperiod`bandwidth'=1 if abs(date-`start_date')<=`bandwidth'
	replace studyperiod`bandwidth'=0 if studyperiod`bandwidth'==.
}
	
*NOW START LOOPING THROUGH MONITORS
foreach monitor of numlist $monitor_list{

*global polynomial, bic chosen, with controls
	*pre post with controls is equivalent to this with order 1
foreach order of numlist 1/9{
	reg outcome post `controls' date_c1-date_c`order' ///
	if abs(date-`start_date')<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "$data,$section,`monitor',`pollutant',"
	file write simulate "`beta_true',`start_date',"
	file write simulate "`length',`anticipation',`expdecayrate',genlogdecay_`b'_`M',`phaseinlength',"
	file write simulate "global_poly,order_`order',`bic',`r2',`n',"
	file write simulate "`estimate',`se',"
	file write simulate "`mintreat',`maxtreat'" _n
}
*separate polynomial, bic chosen, with controls
	*pre post with controls is equivalent to this with order 1
foreach preorder of numlist 1/3{
foreach postorder of numlist 1/3{
	reg outcome post `controls' pre_date_c1-pre_date_c`preorder' post_date_c1-post_date_c`postorder' ///
	if abs(date-`start_date')<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "$data,$section,`monitor',`pollutant',"
	file write simulate "`beta_true',`start_date',"
	file write simulate "`length',`anticipation',`expdecayrate',genlogdecay_`b'_`M',`phaseinlength',"
	file write simulate "separate_poly,order_`preorder'_`postorder',`bic',`r2',`n',"
	file write simulate "`estimate',`se',"
	file write simulate "`mintreat',`maxtreat'" _n
}
}
*local linear
foreach bandwidth of numlist 30 {
	reg outcome post `controls' date_c if abs(date-`start_date')<=`bandwidth' & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "$data,$section,`monitor',`pollutant',"
	file write simulate "`beta_true',`start_date',"
	file write simulate "`length',`anticipation',`expdecayrate',genlogdecay_`b'_`M',`phaseinlength',"
	file write simulate "local_linear,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se',"
	file write simulate "`mintreat',`maxtreat'" _n
}
*augmented local linear
foreach bandwidth of numlist 30 {
	reg outcome `controls' if abs(date-`start_date')<=365*4 & monitor==`monitor'
	predict temp, resid
	reg temp post date_c if studyperiod`bandwidth'==1 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	drop temp
	file write simulate "$data,$section,`monitor',`pollutant',"
	file write simulate "`beta_true',`start_date',"
	file write simulate "`length',`anticipation',`expdecayrate',genlogdecay_`b'_`M',`phaseinlength',"
	file write simulate "local_linear_fullsample,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se',"
	file write simulate "`mintreat',`maxtreat'" _n
}

*END OF MONITOR LOOP (FIRST)
}

*monthly collapse
if "$data"=="airquality"{
	collapse (mean) outcome TempMax TempMin Rain Snow, by(year month monitor)
}
if "$data"=="noise"{
	collapse (mean) outcome, by(year month monitor)
}
gen date=mdy(month,1,year)
gen date_c=(date-`start_date')/1000
	gen post=0
		replace post=1 if date>=`start_date'
	quietly tab month, gen(MM)
	if "$data"=="airquality"{
	foreach v in TempMax TempMin Rain Snow{
		quietly sum `v'
		local temp=r(mean)
		gen `v'2=(`v'-`temp')^2
		gen `v'3=(`v'-`temp')^3
	}
	}
if "$data"=="airquality"{
	local controls "MM* TempMax* TempMin* Rain* Snow*"
}

*RE-START MONITOR LOOPS
foreach monitor of numlist $monitor_list{

	reg outcome post `controls' date_c if abs(date-`start_date')<=365*4 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "$data,$section,`monitor',`pollutant',"
	file write simulate "`beta_true',`start_date',"
	file write simulate "`length',`anticipation',`expdecayrate',genlogdecay_`b'_`M',`phaseinlength',"
	file write simulate "monthlycollapse,monthlycollapse,`bic',`r2',`n',"
	file write simulate "`estimate',`se',"
	file write simulate "`mintreat',`maxtreat'" _n
}	
*END OF MONITOR LOOP (SECOND)
	
}
}
}
}
}
}
}
}
}
file close simulate

end

*-----------------------------------*
*------------Simulations------------*
*---------Air Quality Data----------*
*-----------------------------------*
//right now, 5 hours if you do all the parameter combinations listed below and all the regression
//specifications listed in the program. 

timer clear

timer on 1

global data "airquality"
global section "basic"
global monitor_list "1/88 90/94 96/110" //some of the monitors have too much missing data
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "12274 12614 12717 12804 13061 13344 13712 14471 14910 15678" //recall: randomly selected above
global length_list "1000000" //ie, entire post-period
global anticipation_list "0" 
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "airquality"
global section "sharpdecay"
global monitor_list "1/88 90/94 96/110"
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "12274 12614 12717 12804 13061 13344 13712 14471 14910 15678"
global length_list "30 365" //the treatment lasts only one month or one year
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "airquality"
global section "sharpdecayfull"
global monitor_list "1/88 90/94 96/110"
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "12274 12614 12717 12804 13061 13344 13712 14471 14910 15678"
global length_list "0(100)1500" //treatment lasts 0 days, 100 days, 200 days, etc. up to 4 years of post data = 365*4
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "airquality"
global section "genlogdecay"
global monitor_list "1/88 90/94 96/110"
global beta_true_list "-0.2 "
global pollutant_list "ozone"
global start_date_list "12274 12614 12717 12804 13061 13344 13712 14471 14910 15678"
global length_list "1000000"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "50(25)150" //stepping over b parameter of generalized log decay
global genlog_Mlist "0(200)1000" //stepping over M parameter of generalized log decay
global phasein_list "1"
quietly RDIT_SIM

global data "airquality"
global section "phasein"
global monitor_list "1/88 90/94 96/110"
global beta_true_list "-0.2 "
global pollutant_list "ozone"
global start_date_list "12274 12614 12717 12804 13061 13344 13712 14471 14910 15678"
global length_list "1000000"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "7" //linear ramp-up of treatment of one day, one week, one month, one year
quietly RDIT_SIM

timer off 1
timer list 1


*-----------------------------------*
*-------------Simulations-----------*
*------Counterfactual is noise------*
*-----------------------------------*
//right now, 17 hours if you do all the parameter combinations listed below and all the regression
//specifications listed in the program. 

timer on 2 

//loop through all 1000 replications
global data "noise"
global section "basic"
global monitor_list "1/1000"
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "13344"
global length_list "1000000"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "noise"
global section "sharpdecay"
global monitor_list "1/1000"
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "13344"
global length_list "30 365"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "noise"
global section "sharpdecayfull"
global monitor_list "1/1000"
global beta_true_list "-0.2"
global pollutant_list "ozone"
global start_date_list "13344"
global length_list "0(100)1500" //limiting to 4 years of post data = 365*4
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "1"
quietly RDIT_SIM

global data "noise"
global section "genlogdecay"
global monitor_list "1/1000"
global beta_true_list "-0.2 "
global pollutant_list "ozone"
global start_date_list "13344"
global length_list "1000000"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "50(25)150"
global genlog_Mlist "0(200)1000"
global phasein_list "1"
quietly RDIT_SIM

global data "noise"
global section "phasein"
global monitor_list "1/1000"
global beta_true_list "-0.2 "
global pollutant_list "ozone"
global start_date_list "13344"
global length_list "1000000"
global anticipation_list "0"
global expdecayrate_list "0"
global genlog_blist "0(1)0"
global genlog_Mlist "0(1)0"
global phasein_list "7"
quietly RDIT_SIM

timer off 2
timer list 2




*-----------------------------------*
*----------Autoregression-----------*
*-----------------------------------*
//just one start date, 13344, and just one beta_true: -0.2
file open simulate using "results_autoregress.csv", write text replace

file write simulate "monitor,auto_parameter,ar1_included,ar1_estimate,ar1_standarderror,"
file write simulate "estimator,version,bic,r2,n,"
file write simulate "beta_hat,standard_error" _n

use counterfactual_data_autoregress, clear
xtset monitor date, daily
gen date_c=(date-13344)/1000
gen year=year(date)
gen month=month(date)
* Now put ourselves in the empiricist's shoes.
*variables for regression, assuming constant treatment effect at true start date
	foreach y of numlist 1/9{
		gen date_c`y'=date_c^`y'
	}
	foreach y of numlist 1/9{
		gen pre_date_c`y'=date_c^`y'*(1-post) //run as two separate loops so ordering is convenient
	}
	foreach y of numlist 1/9{	
		gen post_date_c`y'=date_c^`y'*(post)
	}
	gen ym=ym(year,month)
* augmented local linear 
foreach bandwidth of numlist 30 {
	gen studyperiod`bandwidth'=1 if abs(date-13344)<=`bandwidth'
	replace studyperiod`bandwidth'=0 if studyperiod`bandwidth'==.
}
	
foreach monitor of numlist 1/1000{
foreach auto_parameter of numlist 1(2)9{
*global polynomial, bic chosen, with controls
	*pre post with controls is equivalent to this with order 1
foreach order of numlist 1/9{
	reg counterfactual_`auto_parameter' post date_c1-date_c`order' ///
	if abs(date-13344)<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',no,,,"
	file write simulate "global_poly,order_`order',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
foreach order of numlist 1/9{
	reg counterfactual_`auto_parameter' l.counterfactual_`auto_parameter' post date_c1-date_c`order' ///
	if abs(date-13344)<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	lincom l.counterfactual_`auto_parameter'
		local ar1=r(estimate)
		local sear1=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',yes,`ar1',`sear1',"
	file write simulate "global_poly,order_`order',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
*separate polynomial, bic chosen, with controls
	*pre post with controls is equivalent to this with order 1
foreach preorder of numlist 1/3{
foreach postorder of numlist 1/3{
	reg counterfactual_`auto_parameter' post pre_date_c1-pre_date_c`preorder' post_date_c1-post_date_c`postorder' ///
	if abs(date-13344)<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',no,,,"
	file write simulate "separate_poly,order_`preorder'_`postorder',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
}
foreach preorder of numlist 1/3{
foreach postorder of numlist 1/3{
	reg counterfactual_`auto_parameter' l.counterfactual_`auto_parameter' post pre_date_c1-pre_date_c`preorder' post_date_c1-post_date_c`postorder' ///
	if abs(date-13344)<=365*4 & monitor==`monitor', vce(cluster ym)
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	lincom l.counterfactual_`auto_parameter'
		local ar1=r(estimate)
		local sear1=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',yes,`ar1',`sear1',"
	file write simulate "separate_poly,order_`preorder'_`postorder',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
}
*local linear
foreach bandwidth of numlist 30 {
	reg counterfactual_`auto_parameter' post date_c ///
	if abs(date-13344)<=`bandwidth' & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',no,,,"
	file write simulate "local_linear,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
foreach bandwidth of numlist 30 {
	reg counterfactual_`auto_parameter' l.counterfactual_`auto_parameter' post date_c ///
	if abs(date-13344)<=`bandwidth' & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	lincom l.counterfactual_`auto_parameter'
		local ar1=r(estimate)
		local sear1=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',yes,`ar1',`sear1',"
	file write simulate "local_linear,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
*augmented local linear 
foreach bandwidth of numlist 30 {
	reg counterfactual_`auto_parameter' ///
		if abs(date-13344)<=365*4 & monitor==`monitor'
	predict temp, resid
	reg temp post date_c if studyperiod`bandwidth'==1 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	drop temp
	file write simulate "`monitor',`auto_parameter',no,,,"
	file write simulate "local_linear_fullsample,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
foreach bandwidth of numlist 30 {
	reg counterfactual_`auto_parameter' l.counterfactual_`auto_parameter' ///
		if abs(date-13344)<=365*4 & monitor==`monitor'
	lincom l.counterfactual_`auto_parameter'
		local ar1=r(estimate)
		local sear1=r(se)
	predict temp, resid
	reg temp post date_c if studyperiod`bandwidth'==1 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	drop temp
	file write simulate "`monitor',`auto_parameter',yes,`ar1',`sear1',"
	file write simulate "local_linear_fullsample,bandwidth_`bandwidth',`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
}
}
*monthly collapse
collapse (mean) counterfactual*, by(year month monitor)
gen date=mdy(month,1,year)
gen ym=ym(year, month)
gen date_c=(date-13344)/1000
	gen post=0
		replace post=1 if date>=13344
xtset monitor ym, monthly		
foreach monitor of numlist 1/1000{
foreach auto_parameter of numlist 1(2)9{		
	reg counterfactual_`auto_parameter' post date_c ///
	if abs(date-13344)<=365*4 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',no,,,"
	file write simulate "monthlycollapse,monthlycollapse,`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
	
	reg counterfactual_`auto_parameter' l.counterfactual_`auto_parameter' post date_c ///
	if abs(date-13344)<=365*4 & monitor==`monitor'
		local r2=e(r2)
		local n=e(N)
	lincom post
		local estimate=r(estimate)
		local se=r(se)
	lincom l.counterfactual_`auto_parameter'
		local ar1=r(estimate)
		local sear1=r(se)
	estat ic
		matrix A=r(S)
		local bic1=A[1,6]
		local bic=round(`bic1',0.01)
	file write simulate "`monitor',`auto_parameter',yes,`ar1',`sear1',"
	file write simulate "monthlycollapse,monthlycollapse,`bic',`r2',`n',"
	file write simulate "`estimate',`se'" _n
}
}
file close simulate




*-----------------------------------*
*--------Results and Graphs---------*
*-----------------------------------*	
//FIGURE 1:
***NO CONFOUNDERS
insheet using results_airquality_basic.csv, comma clear
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
bysort monitor beta_true start_date estimator: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.01)==round(minbic,0.01)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen orderpre=word(temp,2) if estimator=="separate_poly"
	gen orderpost=	word(temp,3) if estimator=="separate_poly"
	destring orderpre orderpost, replace
	drop temp
	
list estimator version beta_hat if monitor==25 & (best==1|(strmatch(estimator,"*poly*")==0)) ///
	& start_date==13344 //25 34 56 all are fairly representative across all specifications

global monitor 25
global start_date 13344
global beta_true -0.2	
use counterfactual_data_ak2009, clear
	gen date_c=(date-$start_date)/1000
	gen treatment=0
		replace treatment=1 if date>=$start_date
	gen outcome=ln(ozone)+$beta_true*treatment
	gen post=0
		replace post=1 if date>=$start_date
	foreach y of numlist 1/9{
		gen date_c`y'=date_c^`y'
	}
	foreach y of numlist 1/3{
		gen pre_date_c`y'=date_c^`y'*(1-post) //separate pre- and post- polynomials
	}
	foreach y of numlist 1/3{	
		gen post_date_c`y'=date_c^`y'*(post) //run as two separate loops so ordering is convenient
	}
	quietly tab month, gen(MM)
	gen dow=dow(date)
	quietly tab dow, gen(DOW)
	foreach v in TempMax TempMin Rain Snow{
		gen `v'2=`v'^2
		gen `v'3=`v'^3
	}
	gen ym=ym(year,month)
	gen studyperiod=1 if abs(date-$start_date)<=30
		replace studyperiod=0 if studyperiod==.
//panel A: global poly -- FOR MONITOR 25, USE POLY ORDER 2
	reg outcome MM* DOW* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<4*365
	predict resid, residuals
	reg resid post date_c1-date_c2 ///
		if monitor==$monitor & abs(date-$start_date)<4*365
	predict fitted, xb
	gen fitted_pre=fitted if post==0
	gen fitted_post=fitted if post==1
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<4*365 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		title("Global Polynomial") legend(off)
	graph export $imagespath/basicpolytext.eps, as(eps) replace
	graph export $imagespath/basicpolytext.png, as(png) width(900) replace
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<4*365 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		title("Panel A") legend(off)
	graph export $imagespath/basicpoly.eps, as(eps) replace
	graph export $imagespath/basicpoly.png, as(png) width(900) replace
	drop resid fitted*
//panel b: separate poly FOR MONITOR 25, USE POLY ORDERS 1 AND 2	
	reg outcome MM* DOW* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<4*365
	predict resid, residuals
	reg resid post pre_date_c1-pre_date_c1 post_date_c1-post_date_c2 ///
		if monitor==$monitor & abs(date-$start_date)<4*365
	predict fitted, xb
	gen fitted_pre=fitted if post==0
	gen fitted_post=fitted if post==1
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<4*365 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Panel B") legend(off)
	graph export $imagespath/basicseparatepoly.eps, as(eps) replace
	graph export $imagespath/basicseparatepoly.png, as(png) width(900) replace
	drop resid fitted*
//panel c: local linear	
	reg outcome MM* DOW* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<=30
	predict resid, residuals
	reg resid post date_c1 ///
		if monitor==$monitor & abs(date-$start_date)<=30
	predict fitted, xb
	gen fitted_pre=fitted if post==0
	gen fitted_post=fitted if post==1
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<30 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Local Linear") legend(off)
	graph export $imagespath/basiclintext.eps, as(eps) replace
	graph export $imagespath/basiclintext.png, as(png) width(900) replace
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<30 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Panel C") legend(off)
	graph export $imagespath/basiclin.eps, as(eps) replace
	graph export $imagespath/basiclin.png, as(png) width(900) replace
	drop resid fitted*
//panel d,e: augmented local linear
	reg outcome MM* DOW* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<=365*4
	predict resid, residuals
	reg resid post date_c1 ///
		if monitor==$monitor & abs(date-$start_date)<=30
	predict fitted, xb
	gen fitted_pre=fitted if post==0 & studyperiod==1
	gen fitted_post=fitted if post==1 & studyperiod==1
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Panel D") legend(off)
	graph export $imagespath/basiclinfullsample.eps, as(eps) replace
	graph export $imagespath/basiclinfullsample.png, as(png) width(900) replace
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<=30 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Panel E") legend(off)
	graph export $imagespath/basiclinfullsamplezoomed.eps, as(eps) replace
	graph export $imagespath/basiclinfullsamplezoomed.png, as(png) width(900) replace
	drop resid fitted*
//panel f: monthly pre/post
	collapse (mean) outcome TempMax TempMin Rain Snow, by(monitor year month)
	gen date=mdy(month,1,year)
		format %dD_m_CY date
	gen date_c=(date-$start_date)/1000
	gen post=0
		replace post=1 if date>=$start_date
	quietly tab month, gen(MM)
	foreach v in TempMax TempMin Rain Snow{
		gen `v'2=`v'^2
		gen `v'3=`v'^3
	}
	reg outcome MM* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<=365*4
	predict resid, residuals
	reg resid post date_c if monitor==$monitor & abs(date-$start_date)<=365*4
	predict fitted, xb
	gen fitted_pre=fitted if post==0
	gen fitted_post=fitted if post==1
	scatter resid fitted_pre fitted_post date ///
		if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		, xline($start_date, lcolor(gs12)) ///
		msize(small none none) msymbol(O none none) mcolor(gs8 black black) ///
		connect(none l l) lcolor(gs8 black black) lwidth(thin thick thick) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)")  ///
		 title("Panel F") legend(off)
	graph export $imagespath/basicprepost.eps, as(eps) replace
	graph export $imagespath/basicprepost.png, as(png) width(900) replace

	
	

//TABLE 1: 
***RDIT ESTIMATES, NO CONFOUNDERS
insheet using results_airquality_basic.csv, comma clear
/*
codebook monitor //108 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //10 start dates
sum length anticipation expdecay phasein //noconfounders
tab genlog //no confounders
sum mintreat maxtreat //constant treatment effect
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor beta_true start_date estimator version
bysort monitor beta_true start_date estimator: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.01)==round(minbic,0.01)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen orderpre=word(temp,2) if estimator=="separate_poly"
	gen orderpost=	word(temp,3) if estimator=="separate_poly"
	destring orderpre orderpost, replace
	drop temp
sum beta_hat order n if best==1 & estimator=="global_poly"
sum beta_hat orderpre orderpost n if best==1 & estimator=="separate_poly"
	/*
	//bic tie for monitor
	duplicates tag monitor start_date if estimator=="global_poly" & best==1, gen(dup)
	tab dup
	drop dup
	duplicates tag monitor start_date if estimator=="separate_poly" & best==1, gen(dup)
	tab dup
	drop dup
	*/
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" 
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" 
sum beta_hat n if estimator=="monthlycollapse" 

	
insheet using results_noise_basic.csv, comma clear
/*
codebook monitor //1000 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //1 start date
sum length anticipation expdecay phasein //noconfounders
tab genlog //no confounders
sum mintreat maxtreat //constant treatment effe ct
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor beta_true start_date estimator version
bysort monitor beta_true start_date estimator: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.01)==round(minbic,0.01)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen orderpre=word(temp,2) if estimator=="separate_poly"
	gen orderpost=	word(temp,3) if estimator=="separate_poly"
	destring orderpre orderpost, replace
	drop temp
sum beta_hat order n if best==1 & estimator=="global_poly"
sum beta_hat orderpre orderpost n if best==1 & estimator=="separate_poly"
	/*
	//bic tie for monitor
	duplicates tag monitor start_date if estimator=="global_poly" & best==1, gen(dup)
	tab dup
	drop dup
	duplicates tag monitor start_date if estimator=="separate_poly" & best==1, gen(dup)
	tab dup
	drop dup
	*/
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" 
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" 
sum beta_hat n if estimator=="monthlycollapse" 


//FIGURE 2: 
***SHARP DECAY, BIAS TOO LARGE
insheet using results_airquality_sharpdecay.csv, comma clear
replace bic=. if estimator!="global_poly"
isid monitor start_date estimator version length
bysort monitor start_date estimator length: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
list monitor beta_hat order if best==1 & start_d==13344 & length==365 & estimator=="global_poly"
global monitor 5
global order 8
global crop 1.5
	global start_date 13344
	global beta_true -0.2
	global length 365
	use counterfactual_data_ak2009, clear
	gen date_c=(date-$start_date)/1000
	gen treatment=0
		replace treatment=1 if date>=$start_date & date<$start_date+$length
	gen outcome=ln(ozone)+$beta_true*treatment
	gen post_observed=0
		replace post_observed=1 if date>=$start_date
	gen post_true=treatment
	foreach y of numlist 1/9{
		gen date_c`y'=date_c^`y'
	}
	quietly tab month, gen(MM)
	gen dow=dow(date)
	quietly tab dow, gen(DOW)
	foreach v in TempMax TempMin Rain Snow{
		gen `v'2=`v'^2
		gen `v'3=`v'^3
	}
	gen ym=ym(year,month)
	reg outcome MM* DOW* TempMax* TempMin* Rain* Snow* ///
		if monitor==$monitor & abs(date-$start_date)<=365*4
	predict resid, residuals
	reg resid post_observed date_c1-date_c$order ///
		if monitor==$monitor & abs(date-$start_date)<=365*4
	predict fitted_observed, xb
	reg resid post_true date_c1-date_c$order ///
		if monitor==$monitor & abs(date-$start_date)<=365*4
	predict fitted_true, xb
	label variable fitted_observed "Observed Fitted"
	label variable fitted_true "True Fitted"
	/*
	scatter resid fitted_o fitted_t date ///
		if monitor==$monitor & abs(date-$start_date)<=365*4, xline($start_date, lcolor(gs8)) ///
		msize(small tiny small) msymbol(Oh O O) connect(none none none) ///
		mcolor(gs10 black gs6) ///
		graphregion(color(white)) xtitle("")
	*/
	//not plotting a few outliers:
	//separate scatters so that we  can use connect (to make nice legend) but without a line at the discontinuities
	twoway (scatter resid date if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		& abs(resid)<$crop, xline($start_date, lcolor(gs12)) ///
		msize(small) msymbol(O) connect(none) mcolor(gs12) ///
		lcolor(gs12)  ///
		graphregion(color(white)) ytitle("Simulated log ambient ozone (residuals)")) ///
	(scatter fitted_o fitted_t date if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		& (date-$start_date)<0 ///
		& abs(resid)<$crop, xline($start_date, lcolor(gs12)) ///
		msymbol(none none) connect(l l) ///
		lcolor(black gs6) lwidth(thick vthick) ///
		graphregion(color(white)) ytitle("Simulated log ambient ozone (residuals)")) ///
	(scatter fitted_o fitted_t date if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		& (date-$start_date)>0 & abs(date-$start_date)<365 ///
		& abs(resid)<$crop, xline($start_date, lcolor(gs12)) ///
		msymbol(none none) connect(l l) ///
		lcolor(black gs6) lwidth(thick vthick) ///
		graphregion(color(white))ytitle("Simulated log ambient ozone (residuals)")) ///
	(scatter fitted_o fitted_t date if monitor==$monitor & abs(date-$start_date)<=365*4 ///
		& (date-$start_date)>0 & abs(date-$start_date)>365 ///
		& abs(resid)<$crop, xline($start_date, lcolor(gs12)) ///
		msymbol(none none) connect(l l) ///
		lcolor(black gs6) lwidth(thick vthick) legend(order(1 2 3) col(1)) ///
		graphregion(color(white)) xtitle("") ytitle("Simulated log ambient ozone (residuals)"))
	graph export $imagespath/sharpdecaypoly.eps, as(eps) replace
	graph export $imagespath/sharpdecaypoly.png, as(png) width(1800) replace
	count if monitor==$monitor & abs(date-$start_date)<=365*4 & abs(resid)>=$crop & resid!=.


***TABLE: SHARP DECAY	
insheet using results_airquality_sharpdecay.csv, comma clear
/*
codebook monitor //108 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //10 start dates
tab length //two lengths: one month and one year
sum anticipation expdecay phasein //no other confounders
tab genlog //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor start_date estimator version length
bysort monitor start_date estimator length: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
sort monitor start_date length estimator version
format beta_hat order preorder postorder %12.2fc
format n %12.1fc
foreach length of numlist 30 365{
display "best, `length', global poly"
sum beta_hat order n if best==1 & length==`length' & estimator=="global_poly", format
display "best, `length', separate poly"
sum beta_hat preorder postorder n if best==1 & length==`length' & estimator=="separate_poly", format
display "local_linear, `length'"
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" & length==`length', format
display "local_linear_fullsample, `length'"
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" & length==`length', format
display "monthlycollapse, `length'"
sum beta_hat n if estimator=="monthlycollapse" & length==`length', format
}	

insheet using results_noise_sharpdecay.csv, comma clear
/*
codebook monitor //1000 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //1 start date
tab length //two lengths: one month and one year
sum anticipation expdecay phasein //no other confounders
tab genlog //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor start_date estimator version length
bysort monitor start_date estimator length: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
sort monitor start_date length estimator version
format beta_hat order preorder postorder %12.2fc
format n %12.1fc
foreach length of numlist 30 365{
display "best, `length', global poly"
sum beta_hat order n if best==1 & length==`length' & estimator=="global_poly", format
display "best, `length', separate poly"
sum beta_hat preorder postorder n if best==1 & length==`length' & estimator=="separate_poly", format
display "local_linear, `length'"
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" & length==`length', format
display "local_linear_fullsample, `length'"
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" & length==`length', format
display "monthlycollapse, `length'"
sum beta_hat n if estimator=="monthlycollapse" & length==`length', format
}	


//APPENDIX	
***FIGURE: SHARP DECAY
insheet using results_airquality_sharpdecayfull.csv, comma clear
replace bic=. if estimator!="global_poly"
isid monitor start_date estimator version length
bysort monitor start_date estimator length: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.)
	destring order, force replace
sort monitor start_date length estimator version
bysort length order: egen mean_byorder=mean(beta_hat)
scatter mean_byorder length if order==1, ///
	mcolor(black) lcolor(black) connect(l) msymbol(none) lwidth(vthick) ///
	graphregion(color(white)) yline(0, lcolor(gs6)) yline(-0.2, lcolor(gs6)) ///
	xtitle("True treatment length, days") ytitle("Mean of beta estimates")
graph export $imagespath/sharpdecayfull_order1.eps, as(eps) replace	
graph export $imagespath/sharpdecayfull_order1.png, as(png) width(1800) replace	

label variable order "order of polynomial"
scatter mean_byorder length, ///
	by(order, legend(off) graphregion(color(white))) ///
	mcolor(black) lcolor(black) connect(l) msymbol(none) lwidth(vthick) ///
	graphregion(color(white)) yline(0, lcolor(gs6)) yline(-0.2, lcolor(gs6)) ///
	xtitle("True treatment length, days")  ytitle("Mean of beta estimates")
graph export $imagespath/sharpdecayfull_orderall.eps, as(eps) replace
graph export $imagespath/sharpdecayfull_orderall.png, as(png) width(1800) replace

bysort length: egen meanbest=mean(beta_hat) if best==1
scatter meanbest length if best==1, ///
	mcolor(black) lcolor(black) connect(l) msymbol(none) lwidth(vthick) ///
	graphregion(color(white)) yline(0, lcolor(gs6)) yline(-0.2, lcolor(gs6)) ///
	xtitle("True treatment length, days")  ytitle("Mean of beta estimates")  
graph export $imagespath/sharpdecayfull_orderbest.eps, as(eps) replace
graph export $imagespath/sharpdecayfull_orderbest.png, as(png) width(1800) replace


//APPENDIX	
***FIGURE: GENERALIZED LOGISTIC DECAY FUNCTION
use counterfactual_data_ak2009, clear
keep date
duplicates drop
forval b=50(25)150{
forval M=0(200)1000{
local beta_true -0.2
local start_date 13344
local A 0
local K 1
local B=-`b'/10000
local v 0.5
local Q 0.01
*generate treatment and outcome
	gen treatment_b`b'_M`M'=0
		replace treatment_b`b'_M`M'=`beta_true' * (`A'+(`K'-`A')/(1+`Q'*exp(-`B'*(date-`start_date'-`M')))^(1/`v')) if date>=`start_date'
	sum treatment_b`b'_M`M' if date==13344+365*4
	local temp=r(mean)
	display "`temp'"
	if `temp'<(`beta_true' * 0.1){
		drop treatment_b`b'_M`M'
}
}
}
label variable treatment_b150_M0 "B = -0.015, M = 0"
label variable treatment_b50_M0 "B = -0.005, M = 0"
label variable treatment_b50_M200 "B = -0.005, M = 200"
label variable treatment_b150_M1000 "B = -0.015, M = 1000"
scatter treatment_b150_M0 treatment_b50_M0 treatment_b50_M200 treatment_b150_M1000 date ///
	if abs(date-$start_date)<=365*4, ///
	mcolor(black gs6 gs10 gs12) msize(vsmall small small med) ///
	graphregion(color(white)) xtitle("") 
graph export $imagespath/logisticdecayfunctions.eps, as(eps) replace
graph export $imagespath/logisticdecayfunctions.png, as(png) width(1800) replace

//TABLE 3: 
***GENERALIZED LOGISTIC DECAY
insheet using results_airquality_genlogdecay.csv, comma clear
/*treatment has decayed by at least 90% (see above): 
b50: m0 to m200
b75: m0 to m600
b100: m0 to m800
b125: m0 to m1000
b150: m0 to m1000
*/
drop if genlogdecay=="genlogdecay_50_400"
drop if genlogdecay=="genlogdecay_50_600"
drop if genlogdecay=="genlogdecay_50_800"
drop if genlogdecay=="genlogdecay_50_1000"
drop if genlogdecay=="genlogdecay_75_800"
drop if genlogdecay=="genlogdecay_75_1000"
drop if genlogdecay=="genlogdecay_100_1000"
/*
codebook monitor //108 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //10 start dates
tab genlog //5 b parameters, up to 6 M parameters
sum length anticipation expdecay phasein //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor beta_true start_date estimator version genlog 
bysort monitor beta_true start_date estimator genlog: egen minbic=min(bic) 
	gen best_poly=1 if round(bic,0.00001)==round(minbic,0.00001) 
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen orderpre=word(temp,2) if estimator=="separate_poly"
	gen orderpost=	word(temp,3) if estimator=="separate_poly"
	destring orderpre orderpost, replace
	drop temp
gen temp=subinstr(genlog,"_"," ",.)
gen decayb=word(temp,2)
gen decayM=word(temp,3)
destring decay*, replace
gen decayB=-decayb/10000 //so that table is in B, not b, scaling
table decayB decayM if best==1 & estimator=="global_poly", contents(mean beta_hat sd beta_hat n beta_hat) format (%8.2f)
//NOT USED:
table decayB decayM if best==1 & estimator=="separate_poly", contents(mean beta_hat) format(%8.2f)
table decayB decayM if estimator=="local_linear", contents(mean beta_hat) format(%8.2f)
table decayB decayM if estimator=="local_linear_fullsample", contents(mean beta_hat) format(%8.2f)
table decayB decayM if estimator=="monthlycollapse", contents(mean beta_hat) format(%8.2f)

	

insheet using results_noise_genlogdecay.csv, comma clear
/*treatment has decayed by at least 90% (see above): 
b50: m0 to m200
b75: m0 to m600
b100: m0 to m800
b125: m0 to m1000
b150: m0 to m1000
*/
drop if genlogdecay=="genlogdecay_50_400"
drop if genlogdecay=="genlogdecay_50_600"
drop if genlogdecay=="genlogdecay_50_800"
drop if genlogdecay=="genlogdecay_50_1000"
drop if genlogdecay=="genlogdecay_75_800"
drop if genlogdecay=="genlogdecay_75_1000"
drop if genlogdecay=="genlogdecay_100_1000"
/*
codebook monitor //1000 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //1 start date
tab genlog //5 b parameters, up to 6 M parameters
sum length anticipation expdecay phasein //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor beta_true start_date estimator version genlog 
bysort monitor beta_true start_date estimator genlog: egen minbic=min(bic) 
	gen best_poly=1 if round(bic,0.00001)==round(minbic,0.00001) 
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen orderpre=word(temp,2) if estimator=="separate_poly"
	gen orderpost=	word(temp,3) if estimator=="separate_poly"
	destring orderpre orderpost, replace
	drop temp
gen temp=subinstr(genlog,"_"," ",.)
gen decayb=word(temp,2)
gen decayM=word(temp,3)
destring decay*, replace
gen decayB=-decayb/10000 //so that table is in B, not b, scaling
table decayB decayM if best==1 & estimator=="global_poly", contents(mean beta_hat sd beta_hat n beta_hat) format (%8.2f)
//NOT USED:
table decayB decayM if best==1 & estimator=="separate_poly", contents(mean beta_hat) format(%8.2f)
table decayB decayM if estimator=="local_linear", contents(mean beta_hat) format(%8.2f)
table decayb decayM if estimator=="local_linear_fullsample", contents(mean beta_hat) format(%8.2f)
table decayB decayM if estimator=="monthlycollapse", contents(mean beta_hat) format(%8.2f)



	
***TABLE: LINEAR PHASEIN
//NOT USING: DIFFERENT PHASE-IN LENGTHS. ONLY SHOWING 7 DAYS, FOR BREVITY.
insheet using results_airquality_phasein.csv, comma clear
/*
codebook monitor //108 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //10 start dates
tab phasein //1 phase-in length
sum length anticipation expdecay //no other confounders
tab genlog //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor start_date estimator version phasein
bysort monitor start_date estimator phasein: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
foreach phasein of numlist 7 {
display "best, `phasein', global poly"
sum beta_hat order n if best==1 & phasein==`phasein' & estimator=="global_poly"
display "best, `phasein', separate poly"
sum beta_hat preorder postorder n if best==1 & phasein==`phasein' & estimator=="separate_poly"
display "local_linear, `phasein'"
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" & phasein==`phasein'
display "local_linear_fullsample, `phasein'"
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" & phasein==`phasein'
display "monthlycollapse, `phasein'"
sum beta_hat n if estimator=="monthlycollapse" & phasein==`phasein'
}	

insheet using results_noise_phasein.csv, comma clear
/*
codebook monitor //1000 monitors
tab pollutant // 1 pollutant, ozone
tab beta_t //1 betas
tab start_d //1 start date
tab phasein //1 phase-in length
sum length anticipation expdecay //no other confounders
tab genlog //no other confounders
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor start_date estimator version phasein
bysort monitor start_date estimator phasein: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.0001)==round(minbic,0.0001)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
foreach phasein of numlist 7 {
display "best, `phasein', global poly"
sum beta_hat order n if best==1 & phasein==`phasein' & estimator=="global_poly"
display "best, `phasein', separate poly"
sum beta_hat preorder postorder n if best==1 & phasein==`phasein' & estimator=="separate_poly"
display "local_linear, `phasein'"
sum beta_hat n if estimator=="local_linear" & version=="bandwidth_30" & phasein==`phasein'
display "local_linear_fullsample, `phasein'"
sum beta_hat n if estimator=="local_linear_fullsample" & version=="bandwidth_30" & phasein==`phasein'
display "monthlycollapse, `phasein'"
sum beta_hat n if estimator=="monthlycollapse" & phasein==`phasein'
}
	
	
//AUTOREGRESSION
insheet using results_autoregress.csv, comma clear
isid monitor auto_parameter ar1_included estimator version

/*
codebook monitor //1000 monitors
tab auto_parameter
tab ar1_included
*/
replace bic=. if estimator!="global_poly" & estimator!="separate_poly"
isid monitor auto_parameter ar1_included estimator version
bysort monitor auto_parameter ar1_included estimator: egen minbic=min(bic)
	gen best_poly=1 if round(bic,0.01)==round(minbic,0.01)
	replace best_poly=. if bic==.
gen order=subinstr(version,"order_","",.) if estimator=="global_poly"
	destring order, force replace
	gen temp=subinstr(version,"_"," ",.) if estimator=="separate_poly"
	gen preorder=word(temp,2)
	gen postorder=word(temp,3)
	destring preorder postorder, replace
	
table auto_parameter if ar1_included=="no" & best==1 & estimator=="global_poly", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & best==1 & estimator=="global_poly", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)	
table auto_parameter if ar1_included=="no" & best==1 & estimator=="separate_poly", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & best==1 & estimator=="separate_poly", ///
	contents(mean preorder mean postorder sd preorder sd postorder) format (%8.2f)
table auto_parameter if ar1_included=="no" & best==1 & estimator=="separate_poly", ///
	contents(n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="local_linear" & version=="bandwidth_30", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="local_linear" & version=="bandwidth_30", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="local_linear_fullsample" & version=="bandwidth_30", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="local_linear_fullsample" & version=="bandwidth_30", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="monthlycollapse", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="no" & estimator=="monthlycollapse", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)
	
table auto_parameter if ar1_included=="yes" & best==1 & estimator=="global_poly", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & best==1 & estimator=="global_poly", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)	
table auto_parameter if ar1_included=="yes" & best==1 & estimator=="separate_poly", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & best==1 & estimator=="separate_poly", ///
	contents(mean preorder mean postorder sd preorder sd postorder) format (%8.2f)
table auto_parameter if ar1_included=="yes" & best==1 & estimator=="separate_poly", ///
	contents(n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="local_linear" & version=="bandwidth_30", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="local_linear" & version=="bandwidth_30", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="local_linear_fullsample" & version=="bandwidth_30", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="local_linear_fullsample" & version=="bandwidth_30", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="monthlycollapse", ///
	contents(mean beta_hat sd beta_hat mean ar1_estimate sd ar1_estimate) format (%8.2f)
table auto_parameter if ar1_included=="yes" & estimator=="monthlycollapse", ///
	contents(mean order sd order n beta_hat n ar1_estimate) format (%8.2f)

	
	
//ESTIMATING AR(1) PARAMETERS	

 ////////////////////////////////
//DAILY AIR QUALITY DATA FROM EPA
/////////////////////////////////
clear all
foreach y of numlist 2004/2005{
foreach variable in 42101 42401 42602 44201 81102 88101{
	disp "******* `variable', `y' **********"
	insheet using daily_`variable'_`y'.csv, comma clear
	tab parametern
	tab units
	sum arithmeticmean
	sum stmaxvalue
}
}

clear all
foreach y of numlist 2004/2005{
foreach variable in 42101 42401 42602 44201 81102 88101{
	insheet using daily_`variable'_`y'.csv, comma clear
	capture destring statecode, force replace
	save daily_`variable'_`y', replace
}
}
clear all
foreach y of numlist 2004/2005{
foreach variable in 42101 42401 42602 44201 81102 88101{
	append using daily_`variable'_`y'
	capture drop pollutantstandard //string versus byte
}
}
replace parametername="co" if strmatch(parametername,"*monoxide*")
	replace parametername="no2" if strmatch(parametername,"*Nitrogen*")
	replace parametername="ozone" if strmatch(parametername,"*Ozone*")
	replace parametername="so2" if strmatch(parametername,"*Sulfur*") 
	replace parametername="pm25" if parametername=="PM2.5 - Local Conditions"
	replace parametername="pm10" if parametername=="PM10 Total 0-10um STP"
drop countyname cityname cbsaname datum 
gen event=0
	replace event=1 if eventtype!="None"
drop eventtype address localsitename 
collapse (mean) mean=arithmeticmean latitude longitude, ///
	by(statecode countycode sitenum parametername datelocal statename)
drop if statename=="Canada"|statename=="Country Of Mexico"|statename=="Puerto Rico"|statename=="Virgin Islands"
isid statecode countycode sitenum parametername datelocal
reshape wide mean latitude longitude, i(statec countyc siten dateloc) j(parametername) string
	//latitude longitude not constant
rename meanco co
	rename meanno2 no2
	rename meanozone ozone
	rename meanso2 so2
	rename meanpm10 pm10
	rename meanpm25 pm25
corr latitude*
corr longitude*
	egen tlatitude=rowmean(latitude*)
	egen tlongitude=rowmean(longitude*)
	drop latitude* longitude*
	rename tlat latitude
	rename tlong longitude
compress
describe
isid statec countyc siten datel
save airquality, replace

//WEATHER
foreach y of numlist 2004/2005{
	insheet using daily_TEMP_`y'.csv, comma clear
	capture destring statecode, force replace
	save daily_TEMP_`y', replace
	insheet using daily_WIND_`y'.csv, comma clear
	capture destring statecode, force replace
	save daily_WIND_`y', replace
}
clear all
foreach y of numlist 2004/2005{
	append using daily_TEMP_`y'
	append using daily_WIND_`y'
}
drop countyname cityname cbsaname datum 
drop sampleduration eventtype address localsitename 
drop if parametername=="Wind Direction - Resultant"
drop pollutantstandard unitsofmeas //knots and fahrenheit
drop observationcount //almost entirely 24
drop aqi methodcode methodname dateoflast
compress
describe
collapse (mean) mean=arithmeticmean latitude longitude, ///
	by(statecode countycode sitenum parametername datelocal statename)
drop if statename=="Canada"|statename=="Country Of Mexico"|statename=="Puerto Rico"|statename=="Virgin Islands"
isid statecode countycode sitenum parametername datelocal
replace parameter="temperature" if parameter=="Outdoor Temperature"
replace parameter="windspeed" if parameter=="Wind Speed - Resultant"
reshape wide mean, i(statec countyc siten dateloc) j(parametername) string
rename meantemperature temperature
rename meanwindspeed windspeed
isid statec countyc siten datel
save temperatureandwind, replace



////////////////////////////////////
//AR(1) REGRESSIONS, NO PRECIPITATION, ALL STATIONS
//////////////////////////////////////
use airquality, clear
merge 1:1 statecode countycode sitenum datelocal using temperatureandwind, nogen
egen id=group(statec countyc siten)
gen date=date(datelocal,"YMD")
xtset id date, daily
gen dow=dow(date)
gen month=month(date)
quietly tab month, gen(MM)
quietly tab dow, gen(DOW)
xi i.month|temperature, pref(_1) noomit //linear impact of temperature varies by calendar month
xi i.month|windspeed, pref(_2) noomit
foreach v in temperature windspeed{
	gen `v'2=`v'^2
	gen `v'3=`v'^3 //cubic in temperature, windspeed
}
gen temperaturewind=temperature*windspeed 
quietly sum date
	local temp=r(mean)
gen date1=date-`temp' //cubic time trend, demeaned
gen date2=date1^2
gen date3=(date1/100)^3 //rescaled
gen week=week(date)
gen year=year(date)
egen weekofsample=group(week year) //clustering might be needed if too unbalanced for newey-west (unless obs count rather than date)

foreach pollutant in co no2 ozone so2 pm10 pm25{
	gen temp=1 if `pollutant'!=. & l.`pollutant'!=. & temperature!=. & windspeed!=.
	bysort id: egen count_`pollutant'=count(temp)
	drop temp
}
sum count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	codebook id if count_`pollutant'>=30
}
*heterogeneity 
file open heter using heter_daily_version1.csv, ///
	write text replace
file write heter "id,pollutant,obs,r2,ar1estimate,ar1se" _n
foreach pollutant in co no2 ozone so2 pm10 pm25{
	levelsof id if `pollutant'!=. & temperature!=. & windspeed!=. & count_`pollutant'>=30, local(levels) 
	foreach id of local levels{
	file write heter "`id',`pollutant',"
	quietly reg `pollutant' l.`pollutant' temperature* windspeed* temperaturewind DOW* MM* date1-date3 ///
		if id==`id', vce(cluster weekofsample)
	local obs=e(N)
	local r2=e(r2)
	file write heter "`obs',`r2',"
	quietly lincom l.`pollutant'
	local temp1=r(estimate)
	local temp2=r(se)
	file write heter "`temp1',`temp2'" _n
	}
}
file close heter	

//filling in missing weather:
bysort statec date: egen temp1=mean(temperature)
	bysort statec date: egen temp2=mean(windspeed)
	replace temperature=temp1 if temperature==.
	replace windspeed=temp2 if windspeed==.
	drop temp1 temp2
xtset id date, daily
drop count_* temperature2-temperaturewind
foreach v in temperature windspeed{
	gen `v'2=`v'^2
	gen `v'3=`v'^3 //cubic in temperature, windspeed
}
gen temperaturewind=temperature*windspeed 
foreach pollutant in co no2 ozone so2 pm10 pm25{
	gen temp=1 if `pollutant'!=. & l.`pollutant'!=. & temperature!=. & windspeed!=.
	bysort id: egen count_`pollutant'=count(temp)
	drop temp
}
sum count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	codebook id if count_`pollutant'>=30
}
*heterogeneity 
file open heter using heter_daily_version2.csv, ///
	write text replace
file write heter "id,pollutant,obs,r2,ar1estimate,ar1se" _n
foreach pollutant in co no2 ozone so2 pm10 pm25{
	levelsof id if `pollutant'!=. & temperature!=. & windspeed!=. & count_`pollutant'>=30, local(levels) 
	foreach id of local levels{
	file write heter "`id',`pollutant',"
	quietly reg `pollutant' l.`pollutant' temperature* windspeed* temperaturewind DOW* MM* date1-date3 ///
		if id==`id', vce(cluster weekofsample)
	local obs=e(N)
	local r2=e(r2)
	file write heter "`obs',`r2',"
	quietly lincom l.`pollutant'
	local temp1=r(estimate)
	local temp2=r(se)
	file write heter "`temp1',`temp2'" _n
	}
}
file close heter	

//county level:
collapse (mean) co no2 ozone so2 pm10 pm25 temperature windspeed, ///
	by(statec countyc datelocal date* week year weekofsample dow month MM* DOW*)
isid statec countyc date
egen id=group(statec countyc)
xtset id date, daily
foreach v in temperature windspeed{
	gen `v'2=`v'^2
	gen `v'3=`v'^3 //cubic in temperature, windspeed
}
gen temperaturewind=temperature*windspeed 
foreach pollutant in co no2 ozone so2 pm10 pm25{
	gen temp=1 if `pollutant'!=. & l.`pollutant'!=. & temperature!=. & windspeed!=.
	bysort id: egen count_`pollutant'=count(temp)
	drop temp
}
sum count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	codebook id if count_`pollutant'>=30
}
*heterogeneity 
file open heter using heter_daily_version3.csv, ///
	write text replace
file write heter "id,pollutant,obs,r2,ar1estimate,ar1se" _n
foreach pollutant in co no2 ozone so2 pm10 pm25{
	levelsof id if `pollutant'!=. & temperature!=. & windspeed!=. & count_`pollutant'>=30, local(levels) 
	foreach id of local levels{
	file write heter "`id',`pollutant',"
	quietly reg `pollutant' l.`pollutant' temperature* windspeed* temperaturewind DOW* MM* date1-date3 ///
		if id==`id', vce(cluster weekofsample)
	local obs=e(N)
	local r2=e(r2)
	file write heter "`obs',`r2',"
	quietly lincom l.`pollutant'
	local temp1=r(estimate)
	local temp2=r(se)
	file write heter "`temp1',`temp2'" _n
	}
}
file close heter	



////////////////////////////////////////////
//PRECIPITATION CONTROLS, SUBSAMPLE OF STATES
///////////////////////////////////////////
//6 biggest states only because computationally burdensome
//"biggest" meaning most pm25 monitors (that's smallest category of the pollutants)
use airquality, clear
bysort statec: egen count=count(pm25)
keep statec staten count
duplicates drop
gsort -count
list statec staten count
//california, pennsylvania, florida, ohio, north carolina, texas
//6 42 12 39 37 48

use airquality, clear
keep statec countyc siten latitude longitude
duplicates drop
isid statec countyc siten
keep if statec==6|statec==12|statec==37|statec==39|statec==42|statec==48
save daily_data_locations, replace

*Next: match to weather data. start with gridded weather data from Schlenker? Vincenty distance?
*I take the monitor locations and match to one of the grids and Roberts and Schlenker
insheet using gridInfo.csv, comma clear
	drop croparea
	//"cross" is very slow, and matching by vincenty within that will also be slow.
	//for simplicity, joinby within a state.
	tostring fips, replace
	gen statecode=substr(fips,1,1) if length(fips)==4
	replace statecode=substr(fips,1,2) if length(fips)==5
	drop fips
	destring, replace	
	keep if statec==6|statec==12|statec==37|statec==39|statec==42|statec==48
	count
	joinby statecode using daily_data_locations, unmatched(none)
	isid gridnumber latitude longitude
	gen longitude_grid = -125 + mod(gridnumber-1,1405)/24
	gen latitude_grid = 49.9375+1/48 - ceil(gridnumber/1405)/24	
vincenty latitude_g longitude_g latitude longitude, vin(distance) 	
	gsort statec countyc sitenum distance
	keep if statec!=statec[_n-1] | countyc!=countyc[_n-1] | siten!=siten[_n-1]
sum distance
isid statec countyc sitenum
keep statec countyc sitenum gridnumber
save monitor_x_gridnumber, replace	
	
foreach s of numlist 6 12 37 39 42 48{
foreach y of numlist 2004(1)2005{
	insheet using state`s'_year`y'.csv, comma clear
	save state`s'_year`y', replace
}
}
clear all
foreach s of numlist 6 12 37 39 42 48{
foreach y of numlist 2004(1)2005{
	append using state`s'_year`y', 
}
}
save statessix_0405, replace

foreach y of numlist 2004/2005{
foreach variable in 42101 42401 42602 44201 81102 88101{
	erase daily_`variable'_`y'.dta
}
}
foreach y of numlist 2004/2005{
	erase daily_TEMP_`y'.dta
	erase daily_WIND_`y'.dta
}
foreach s of numlist 6 12 37 39 42 48{
foreach y of numlist 2004(1)2005{
	erase state`s'_year`y'.dta
}
}


use airquality, clear
merge m:1 statec countyc siten using monitor_x_gridnumber
	keep if _m==3 //34% of observations
	drop _m
gen date=date(datel,"YMD")
gen month=month(date)
gen day=day(date)
gen year=year(date)
merge m:1 gridnumber month day year using statessix_0405
	//expect lots of 3s and 2s, expect no 1s
	//_m==2 not a problem: weather data but no air quality data
	drop if _m==2
	drop _m
merge 1:1 statecode countycode sitenum datelocal using temperatureandwind, ///
	keep(master match) nogen

//comparing epa's temperature (i assume fahrenheit?) to roberts and schlenker temperature (celsius)
gen tmean=(tmin/2+tmax/2)*1.8+32
corr temperature tmean
reg temperature tmean
scatter temperature tmean
//close

egen id=group(statec countyc siten)
xtset id date, daily
gen dow=dow(date)
quietly tab month, gen(MM)
quietly tab dow, gen(DOW)
xi i.month|tmean, pref(_1) noomit //linear impact of temperature varies by calendar month
xi i.month|prec, pref(_2) noomit
foreach v in tmean prec{
	gen `v'2=`v'^2
	gen `v'3=`v'^3 //cubic in temperature, windspeed
}
gen tmeanprec=tmean*prec 
quietly sum date
	local temp=r(mean)
gen date1=date-`temp' //cubic time trend, demeaned
gen date2=date1^2
gen date3=(date1/100)^3 //rescaled
gen week=week(date)
egen weekofsample=group(week year) //clustering might be needed if too unbalanced for newey-west (unless obs count rather than date)

foreach pollutant in co no2 ozone so2 pm10 pm25{
	gen temp=1 if `pollutant'!=. & l.`pollutant'!=. & tmean!=. & prec!=.
	bysort id: egen count_`pollutant'=count(temp)
	drop temp
}
sum count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	codebook id if count_`pollutant'>=30
}
*heterogeneity 
file open heter using heter_daily_version4.csv, ///
	write text replace
file write heter "id,pollutant,obs,r2,ar1estimate,ar1se" _n
foreach pollutant in co no2 ozone so2 pm10 pm25{
	levelsof id if `pollutant'!=. & tmean!=. & prec!=. & count_`pollutant'>=30, local(levels) 
	foreach id of local levels{
	file write heter "`id',`pollutant',"
	quietly reg `pollutant' l.`pollutant' tmean* prec* tmeanprec DOW* MM* date1-date3 ///
		if id==`id', vce(cluster weekofsample)
	local obs=e(N)
	local r2=e(r2)
	file write heter "`obs',`r2',"
	quietly lincom l.`pollutant'
	local temp1=r(estimate)
	local temp2=r(se)
	file write heter "`temp1',`temp2'" _n
	}
}
file close heter		
	
//filling in windspeed
bysort statec date: egen temp2=mean(windspeed)
	replace windspeed=temp2 if windspeed==.
	drop temp2
xtset id date, daily
foreach v in windspeed{
	gen `v'2=`v'^2
	gen `v'3=`v'^3 //cubic in temperature, windspeed
}
gen tmeanwind=tmean*windspeed 
drop count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	gen temp=1 if `pollutant'!=. & l.`pollutant'!=. & tmean!=. & windspeed!=.
	bysort id: egen count_`pollutant'=count(temp)
	drop temp
}
sum count_*
foreach pollutant in co no2 ozone so2 pm10 pm25{
	codebook id if count_`pollutant'>=30
}
*heterogeneity 
file open heter using heter_daily_version5.csv, ///
	write text replace
file write heter "id,pollutant,obs,r2,ar1estimate,ar1se" _n
foreach pollutant in co no2 ozone so2 pm10 pm25{
	levelsof id if `pollutant'!=. & temperature!=. & windspeed!=. & count_`pollutant'>=30, local(levels) 
	foreach id of local levels{
	file write heter "`id',`pollutant',"
	quietly reg `pollutant' l.`pollutant' tmean* prec* windspeed* tmeanprec tmeanwind DOW* MM* date1-date3 ///
		if id==`id', vce(cluster weekofsample)
	local obs=e(N)
	local r2=e(r2)
	file write heter "`obs',`r2',"
	quietly lincom l.`pollutant'
	local temp1=r(estimate)
	local temp2=r(se)
	file write heter "`temp1',`temp2'" _n
	}
}
file close heter	

//results
insheet using heter_daily_version1.csv, comma clear names
table pollutant, contents(mean ar1est mean ar1se n obs mean obs mean r2 )
insheet using heter_daily_version5.csv, comma clear names
table pollutant, contents(mean ar1est mean ar1se n obs mean obs mean r2 )

//not used:
insheet using heter_daily_version2.csv, comma clear names
table pollutant, contents(mean ar1est mean ar1se n obs mean obs mean r2 )
insheet using heter_daily_version3.csv, comma clear names
table pollutant, contents(mean ar1est mean ar1se n obs mean obs mean r2 )
insheet using heter_daily_version4.csv, comma clear names
table pollutant, contents(mean ar1est mean ar1se n obs mean obs mean r2 )