*********************************************************************** ** * STATA TUTORIAL USING 2000 CENSUS DATA FROM CALIFORNIA * CENSUS DATA FILE HAS ONLY 600 OBSERVATIONS TO HAVE A * MANAGEABLE DATASET * by Rocio Titiunik * Jan 31st, 2008 *********************************************************************** /***************** * Websites with STATA tutorials http://www.princeton.edu/~erp/stata/main.html http://data.princeton.edu/stata/ http://www.cpc.unc.edu/services/computer/presentations/statatutorial http://www.bsos.umd.edu/gvpt/uslaner/statatut.pdf ******************/ * These initial commands should be in ALL your do files; #delimit; clear; cap log close; set more off; set mem 800m; set matsize 300; program drop _all; * Example of a global (don't worry too much about it now); global ca_dofiles "~/work/__Papers/Jas/Redistricting/stata/California"; ******************* Prepare the files that I will use in this tutorial code *******************; *do "prepare.census.data.do"; /*************************************************** Read a dataset into Stata 'use' if dataset is in Stata format 'insheet' if dataset is CSV or space-separated 'infix' if dataset is fixed-format ****************************************************/ ******************************************************************************** WORK WITH CENSUS DATA SUMMARY FILE 1 ********************************************************************************; /************************************************************ GEOFILE (fixed-column file ==> this is the only file with census unit ID variables) *************************************************************/ use "./geofile_ca_orig.dta", clear; * To keep a subset of observations use 'keep'; * keep summary level 101 ==> get block level dataset; keep if sumlev=="101"; keep if geocomp=="00"; * To transform from string to numeric, use 'destring'; destring logrecno, replace; * To sort by one or more variables, use 'sort'; sort logrecno; * To save a file (in Stata format) , use 'save'; save "./geofile_ca.dta", replace; /************************************************************ Open file 01 ==> Get total population, population by race, population urban and rural *************************************************************/ * Open file 01; use "./sf1_file01_ca_orig.dta", clear; do "$ca_dofiles/00_col_names_SF1_file01.do"; * To rename a variable, use 'rename'; rename P001001 pop_total; rename P002002 pop_urban; rename P002005 pop_rural; rename P005001 pop_18above; * To generate new variables, use 'gen'; /* RACE and ETHNICITY of Population 18 and above*/ gen pop_white18 =P005003; gen pop_black18 =P005004; gen pop_amerindian18 =P005005; gen pop_asian18 =P005006; gen pop_pacific18 =P005007; gen pop_otherace18 =P005008; gen pop_2moreraces18 =P005009; * To drop variables you don't want anymore, use 'drop'; drop P00*; * To drop observations, use 'drop if'; drop if pop_white18 < 0; sort logrecno; save "./sf1_file01_ca.dta", replace; /************************************************************ Open file 02==> Get population by sex and age, households, household size, household type *************************************************************/ * Open file 02; use "./sf1_file02_ca_orig.dta", clear; * Put column names; do "$ca_dofiles/00_col_names_SF1_file02.do"; /* RACE and ETHNICITY */ rename P007002 pop_white; * White alone; rename P007003 pop_black; * Black alone; rename P007004 pop_amerindian; * American Indian and Alaska Native alone; rename P007005 pop_asian; * Asian alone; rename P007006 pop_pacific; * Native Hawaiian and Other Pacific Islander alone; rename P007007 pop_otherace; * Some other race alone; rename P007008 pop_2moreraces; * Two or more races; rename P008002 pop_nohispanic; * Not Hispanic or Latino; rename P008003 pop_nohispanic_white; * Not Hispanic and white; rename P008010 pop_hispanic; * Hispanic or Latino; * Hispanic of Latino of 18 and above; gen pop_hispanic18 =P006002; /* AGE BY SEX - Universe: Total population */ rename P012001 pop_total2; rename P012002 pop_male; rename P012003 pop_male0_to_5; rename P012004 pop_male5_to_9; rename P012005 pop_male10_to_14; rename P012006 pop_male15_to_17; rename P012007 pop_male18_to_19; rename P012008 pop_male20; rename P012009 pop_male21; rename P012010 pop_male22_to_24; rename P012011 pop_male25_to_29; rename P012012 pop_male30_to_34; rename P012013 pop_male35_to_39; rename P012014 pop_male40_to_44; rename P012015 pop_male45_to_49; rename P012016 pop_male50_to_54; rename P012017 pop_male55_to_59; rename P012018 pop_male60_to_61; rename P012019 pop_male62_to_64; rename P012020 pop_male65_to_66; rename P012021 pop_male67_to_69; rename P012022 pop_male70_to_74; rename P012023 pop_male75_to_79; rename P012024 pop_male80_to_84; rename P012025 pop_male85plus; rename P012026 pop_fem; rename P012027 pop_fem0_to_5; rename P012028 pop_fem5_to_9; rename P012029 pop_fem10_to_14; rename P012030 pop_fem15_to_17; rename P012031 pop_fem18_to_19; rename P012032 pop_fem20; rename P012033 pop_fem21; rename P012034 pop_fem22_to_24; rename P012035 pop_fem25_to_29; rename P012036 pop_fem30_to_34; rename P012037 pop_fem35_to_39; rename P012038 pop_fem40_to_44; rename P012039 pop_fem45_to_49; rename P012040 pop_fem50_to_54; rename P012041 pop_fem55_to_59; rename P012042 pop_fem60_to_61; rename P012043 pop_fem62_to_64; rename P012044 pop_fem65_to_66; rename P012045 pop_fem67_to_69; rename P012046 pop_fem70_to_74; rename P012047 pop_fem75_to_79; rename P012048 pop_fem80_to_84; rename P012049 pop_fem85plus; /**************************************** STRUCTURE OF 'foreach' LOOP: foreach j in ....... { commands using `j' } ****************************************/ * Example using real data; foreach x in male fem {; gen pop_`x'15_to_19 = pop_`x'15_to_17+pop_`x'18_to_19; gen pop_`x'20_to_24 = pop_`x'20 + pop_`x'21+pop_`x'22_to_24; gen pop_`x'60_to_64 = pop_`x'60_to_61+pop_`x'62_to_64; gen pop_`x'65_to_69 = pop_`x'65_to_66+pop_`x'67_to_69; drop pop_`x'15_to_17 pop_`x'18_to_19 pop_`x'20 pop_`x'21 pop_`x'22_to_24 pop_`x'60_to_61 pop_`x'62_to_64 pop_`x'65_to_66 pop_`x'67_to_69; }; /*HOUSEHOLDS [1]*/ rename P015001 hh_tot; rename P016001 pop_in_hh; rename P017001 avera_hh_size; drop P0*; sort logrecno; save "./sf1_file02_ca.dta", replace; /************************************************************ VERY IMPORTANT COMMAND: 'merge' ==> combines two or more datasets *************************************************************/ use "./geofile_ca.dta", clear; sort logrecno; merge logrecno using "./sf1_file01_ca.dta"; tab _merge; drop if _merge==2; drop _merge; sort logrecno; merge logrecno using "./sf1_file02_ca.dta"; tab _merge; drop if _merge==2; drop _merge; * command 'bys' creates variables by block ==> See examples below; * Generate proportion of population by block-group; bys state county tract blkgrp: egen pop_blkgr=sum(pop_total); gen pop_per_blkgr=pop_total/pop_blkgr; * to replace some values for others in a given variable, use 'replace'; replace pop_per_blkgr=0 if pop_blkgr==0; * put zeros to the proportion for those blocks that are in block groups with zero population; * to create a label for a variable, use 'label'; label var pop_per_blkgr "Percent of total block group population in the block"; * Generate proportion of households by block group; bys state county tract blkgrp: egen hh_blkgr=sum(hh_tot); gen hh_per_blkgr=hh_tot/hh_blkgr; replace hh_per_blkgr=0 if hh_blkgr==0; * put zeros to the proportion for those blocks that are in block groups with zero households; label var hh_per_blkgr "Percent of total block group households in the block"; save "./census2000.dta", replace; /************************************************************ SUMMARY FILE 3 *************************************************************/ /************************************************************ Open the fixed-column geo file ==> this is the only file with census unit ID variables *************************************************************/ use "./geofilesf3_ca_orig.dta", clear; * keep summary level 150 ==> get block group level dataset; keep if sumlev=="150"; keep if geocomp=="00"; destring logrecno, replace; foreach file in 02 03 04 06 07 08 {; sort logrecno; merge logrecno using "./sf3_file`file'_ca_orig", _merge(merge_`file'); tab merge_`file'; keep if merge_`file' == 3; drop merge_`file'; }; /*LANGUAGE SPOKEN AT HOME FOR POPULATION 5 YEARS AND OVER */ /*Universe: Population 5 years and over*/ egen blkgr_pop_5_speak_engl =rsum(P019003 P019025 P019047); egen blkgr_pop_5_speak_span =rsum(P019004 P019026 P019048); egen blkgr_pop_5_speak_indo =rsum(P019009 P019031 P019053); egen blkgr_pop_5_speak_asianpac =rsum(P019014 P019036 P019058); egen blkgr_pop_5_speak_other =rsum(P019019 P019041 P019063); rename P021001 blkgr_sf3_pop_tot; rename P021003 blkgr_pop_native_born_ca; rename P021004 blkgr_pop_native_born_usstate; rename P021009 blkgr_pop_native_born_outus; rename P021014 blkgr_pop_foreign_naturcit; rename P021015 blkgr_pop_foreign_nocit; gen blkgr_pop_citizen=P021002+P021013-blkgr_pop_foreign_nocit; gen blkgr_pop_citizen_test=blkgr_pop_native_born_ca+blkgr_pop_native_born_usstate+blkgr_pop_native_born_outus +blkgr_pop_foreign_naturcit; /* P021001='Total:' P021002=' Native:' P021003=' Born in state of residence' P021004=' Born in other state in the United States:' P021005=' Northeast' P021006=' Midwest' P021007=' South' P021008=' West' P021009=' Born outside the United States:' P021010=' Puerto Rico' P021011=' U.S. Island Areas' P021012=' Born abroad of American parent(s)' P021013=' Foreign born:' P021014=' Naturalized citizen' P021015=' Not a citizen' */ /*SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER [35]*/ /*Universe: Population 25 years and over*/ egen blkgr_pop_25_0to4grade =rsum(P037003 P037004 P037020 P037021); egen blkgr_pop_25_5to8grade =rsum(P037005 P037006 P037022 P037023); egen blkgr_pop_25_9to11grade =rsum(P037007 P037008 P037009 P037010 P037024 P037025 P037026 P037027); * includes 12 grade but no highschool diploma; egen blkgr_pop_25_highschool =rsum(P037011 P037012 P037028 P037029); * highschool diploma: includes people who attended college for less than one year; egen blkgr_pop_25_1colnode =rsum(P037013 P037030); * attended college for at least one year but has no degree; egen blkgr_pop_25_bachelor =rsum(P037014 P037015 P037031 P037032); * bachelor degree or associate degree; egen blkgr_pop_25_graduate =rsum(P037016 P037017 P037018 P037033 P037034 P037035); * graduate level degree (master, professional school or doctorate degree); /* P037001 'Total:' P037002 ' Male:' P037003 ' No schooling completed' P037004 ' Nursery to 4th grade' P037005 ' 5th and 6th grade' P037006 ' 7th and 8th grade' P037007 ' 9th grade' P037008 ' 10th grade' P037009 ' 11th grade' P037010 "' 12th grade, no diploma'" P037011 ' High school graduate (includes equivalency)' P037012 "' Some college, less than 1 year'" P037013 "' Some college, 1 or more years, no degree'" P037014 ' Associate degree' P037015 ' Bachelor's degree' P037016 ' Master's degree' P037017 ' Professional school degree' P037018 ' Doctorate degree' P037019 ' Female:' P037020 ' No schooling completed' P037021 ' Nursery to 4th grade' P037022 ' 5th and 6th grade' P037023 ' 7th and 8th grade' P037024 ' 9th grade' P037025 ' 10th grade' P037026 ' 11th grade' P037027 "' 12th grade, no diploma'" P037028 ' High school graduate (includes equivalency)' P037029 "' Some college, less than 1 year'" P037030 "' Some college, 1 or more years, no degree'" P037031 ' Associate degree' P037032 ' Bachelor's degree' P037033 ' Master's degree' P037034 ' Professional school degree' P037035 ' Doctorate degree' ; */ /*SEX BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER [15]*/ /*Universe: Population 16 years and over*/ egen blkgr_pop_16_nolabfor =rsum(P043008 P043015); egen blkgr_pop_16_inlabfor_all =rsum(P043003 P043010); egen blkgr_pop_16_inlabfor_arf =rsum(P043004 P043011); egen blkgr_pop_16_employed =rsum(P043006 P043013); egen blkgr_pop_16_unemployed =rsum(P043007 P043014); /* P043001='Total:' P043002=' Male:' P043003=' In labor force:' P043004=' In Armed Forces' P043005=' Civilian:' P043006=' Employed' P043007=' Unemployed' P043008=' Not in labor force' P043009=' Female:' P043010=' In labor force:' P043011=' In Armed Forces' P043012=' Civilian:' P043013=' Employed' P043014=' Unemployed' P043015=' Not in labor force' */ /*HOUSEHOLD INCOME IN 1999 [17]*/ /*Universe: Households*/ rename P052001 blkgr_sf3_hh_tot; egen blkgr_hh_income99_0to19 =rsum(P052002 P052003 P052004); egen blkgr_hh_income99_20to39 =rsum(P052005 P052006 P052007 P052008); egen blkgr_hh_income99_40to59 =rsum(P052009 P052010 P052011); egen blkgr_hh_income99_60to74 =rsum(P052012); egen blkgr_hh_income99_75to99 =rsum(P052013); egen blkgr_hh_income99_100to199 =rsum(P052014 P052015 P052016); egen blkgr_hh_income99_200 =rsum(P052017); /* P052001='Total:' P052002=' Less than $10,000' P052003=' $10,000 to $14,999' P052004=' $15,000 to $19,999' P052005=' $20,000 to $24,999' P052006=' $25,000 to $29,999' P052007=' $30,000 to $34,999' P052008=' $35,000 to $39,999' P052009=' $40,000 to $44,999' P052010=' $45,000 to $49,999' P052011=' $50,000 to $59,999' P052012=' $60,000 to $74,999' P052013=' $75,000 to $99,999' P052014=' $100,000 to $124,999' P052015=' $125,000 to $149,999' P052016=' $150,000 to $199,999' P052017=' $200,000 or more' */ /*MEDIAN HOUSEHOLD INCOME IN 1999 (DOLLARS) [1]*/ /*Universe: Households*/ rename P053001 blkgr_hh_income99_median; /* P053001='Median household income in 1999' */ /*AGGREGATE HOUSEHOLD INCOME IN 1999 (DOLLARS) [3]*/ /*Universe: Households*/ rename P054001 blkgr_hh_income99_aggregate; /* P054001='Aggregate household income in 1999:' P054002=' Less than $200,000' P054003=' $200,000 or more' */ drop P0* P1*; tempfile tempblkgr; sort state county tract blkgrp; save tempblkgr, replace; /* Open the census data file at the *block* level */ use "./census2000.dta", clear; sort state county tract blkgrp; /* Merge the census data file at the *block-group* level */ merge state county tract blkgrp using tempblkgr; tab _merge; keep if _merge==3; drop _merge ; keep fileid stusab sumlev geocomp chariter cifsn logrecno state county countysc tract blkgrp block pop100 pop* hh* blkgr*; * Check; gen temp=blkgr_sf3_pop_tot-pop_blkgr; sum temp;drop temp; gen temp=blkgr_sf3_hh_tot-hh_blkgr; sum temp;drop temp; * Note: the population and households by blockgroup in SF3 is different from the population by blockgroup in SF3 due to error (SF3 is a survey * not a census); /* TAKE ALL BLOCK-GROUP LEVEL VARIABLES TO BLOCK LEVEL */ * Individuals; foreach var in sf3_pop_tot pop_native_born_ca pop_native_born_usstate pop_native_born_outus pop_foreign_naturcit pop_foreign_nocit pop_citizen pop_5_speak_engl pop_5_speak_span pop_5_speak_indo pop_5_speak_asianpac pop_5_speak_other pop_25_0to4grade pop_25_5to8grade pop_25_9to11grade pop_25_highschool pop_25_1colnode pop_25_bachelor pop_25_graduate pop_16_nolabfor pop_16_inlabfor_all pop_16_inlabfor_arf pop_16_employed pop_16_unemployed {; gen `var'=blkgr_`var'*pop_per_blkgr; }; * Households; foreach var in sf3_hh_tot hh_income99_0to19 hh_income99_20to39 hh_income99_40to59 hh_income99_60to74 hh_income99_75to99 hh_income99_100to199 hh_income99_200 {; gen `var'=blkgr_`var'*hh_per_blkgr; }; * Generate population of citizens above 18 (VEP); gen pop_citizen18=pop_citizen*(pop_18above/pop_total); order fileid stusab sumlev geocomp chariter cifsn logrecno state county countysc tract blkgrp block pop100 sf3* pop* hh* blkgr*; * Prepare blockkey variable to merge to election data; gen blockkey=state+county+tract+block; compress; sort blockkey; save "./census2000.dta", replace; /* THIS FILE CONTAINS 2000 CENSUS INFORMATION AT THE CENSUS BLOCK LEVEL AND IT IS READY TO MERGE TO THE ELECTION DATA BY CENSUS BLOCK*/ /************************** REMOVE ALL TEMPFILES BEFORE EXITING *************************/ !\rm -fr temp*.dta;