#######################################################################
## R version of code "intro.stata.do"
# USES 2000 CENSUS DATA FROM CALIFORNIA
# CENSUS DATA FILE HAS ONLY 600 OBSERVATIONS TO HAVE A
# MANAGEABLE DATASET
# by Rocio Titiunik
# Jan 31st, 2008
#######################################################################

library(foreign)
# The script originally ran install.packages("gregmisc") on every execution.
# Only gdata is loaded below, so install that package, and only when missing.
if (!requireNamespace("gdata", quietly = TRUE)) {
  install.packages("gdata", dependencies = TRUE)
}
library(gdata)

# Note: before starting, execute do file: prepare.census.forR.do

###########################################
# WORK WITH CENSUS DATA
# SUMMARY FILE 1
###########################################

#############################################################
# GEOFILE (fixed-column file ==> this is the only file with census unit
# ID variables)
#############################################################
geo <- read.dta("./geofile_ca_orig.dta")
dim(geo)
names(geo)

# keep a subset of observations: use indexing
# keep summary level 101 ==> get block level dataset
geo <- geo[geo$sumlev == "101", ]
geo <- geo[geo$geocomp == "00", ]
dim(geo)

# To transform from string to numeric, use 'as.numeric'
geo$logrecno <- as.numeric(geo$logrecno)

# To sort by one or more variables, use 'order'
geo <- geo[order(geo$logrecno), ]
geo$logrecno[1:10]

#############################################################
# Work with file 01 ==> Get total population, population by race,
# population urban and rural
#############################################################
file1 <- read.dta("./sf1_file01_ca_R.dta")
dim(file1)
names(file1)

# To rename variables
# gdata's rename.vars() could be used instead; the original note says the
# call "does not work" -- presumably because its result was never assigned:
#   file1 <- rename.vars(file1,
#                        from = c("P001001", "P002002", "P002005", "P005001"),
#                        to = c("pop_total", "pop_urban", "pop_rural",
#                               "pop_18above"),
#                        info = FALSE)
# Here each census code is mapped to its new name explicitly, so the result
# does not depend on the order of the columns in the file.
sf1_names <- c(
  P001001 = "pop_total",
  P002002 = "pop_urban",
  P002005 = "pop_rural",
  P005001 = "pop_18above"
)
stopifnot(all(names(sf1_names) %in% names(file1)))
pos <- match(names(file1), names(sf1_names))
names(file1)[!is.na(pos)] <- unname(sf1_names[pos[!is.na(pos)]])
names(file1)

# To generate new variables
## RACE and ETHNICITY of Population 18 and above ##
file1$pop_white18 <- file1$P005003
file1$pop_black18 <- file1$P005004
file1$pop_amerindian18 <- file1$P005005
file1$pop_asian18 <- file1$P005006
file1$pop_pacific18 <- file1$P005007
file1$pop_otherace18 <- file1$P005008
file1$pop_2moreraces18 <- file1$P005009

# To drop variables you don't want anymore
# sets to keep
# NOTE(review): positions are hard-coded; seven columns were just appended
# but 229:234 spans only six -- confirm against the actual file layout.
vars <- c(1:6, 8, 11, 157, 229:234)
file1 <- file1[, vars]
dim(file1)
names(file1)

# To drop observations
# The original wrote file1$pop_white, which only resolves through `$` partial
# matching; pop_white18 is the only such column this script creates in file1.
file1 <- file1[file1$pop_white18 >= 0, ]
dim(file1)

file1 <- file1[order(file1$logrecno), ]

#############################################################
# Open file 02 ==> Get population by sex and age, households,
# household size, household type
#############################################################
file2 <- read.dta("./sf1_file02_ca_R.dta")
dim(file2)

# Census code -> descriptive name. Codes absent from file2 are skipped
# silently, exactly as the one-by-one renames did.
sf1_file2_names <- c(
  ## RACE and ETHNICITY ##
  P007002 = "pop_white",            # White alone
  P007003 = "pop_black",            # Black alone
  P007004 = "pop_amerindian",       # American Indian and Alaska Native alone
  P007005 = "pop_asian",            # Asian alone
  P007006 = "pop_pacific",          # Native Hawaiian and Other Pacific Islander alone
  P007007 = "pop_otherace",         # Some other race alone
  P007008 = "pop_2moreraces",       # Two or more races
  P008002 = "pop_nohispanic",       # Not Hispanic or Latino
  P008003 = "pop_nohispanic_white", # Not Hispanic and white
  P008010 = "pop_hispanic",         # Hispanic or Latino
  ## AGE BY SEX - Universe: Total population ##
  P012001 = "pop_total2",
  P012002 = "pop_male",
  P012003 = "pop_male0_to_5",
  P012004 = "pop_male5_to_9"
)
pos <- match(names(file2), names(sf1_file2_names))
names(file2)[!is.na(pos)] <- unname(sf1_file2_names[pos[!is.na(pos)]])

# Hispanic or Latino, 18 and above
file2$pop_hispanic18 <- file2$P006002
## AGE BY SEX (continued): rename the P012 cells of file2.
# The mapping is kept as a lookup table (census code -> new name) and applied
# in one pass. Codes that are not present in file2 are skipped silently,
# which is what the individual `names(file2)[names(file2) == code] <- name`
# statements did.
age_sex_names <- c(
  # males
  P012005 = "pop_male10_to_14",
  P012006 = "pop_male15_to_17",
  P012007 = "pop_male18_to_19",
  P012008 = "pop_male20",
  P012009 = "pop_male21",
  P012010 = "pop_male22_to_24",
  P012011 = "pop_male25_to_29",
  P012012 = "pop_male30_to_34",
  P012013 = "pop_male35_to_39",
  P012014 = "pop_male40_to_44",
  P012015 = "pop_male45_to_49",
  P012016 = "pop_male50_to_54",
  P012017 = "pop_male55_to_59",
  P012018 = "pop_male60_to_61",
  P012019 = "pop_male62_to_64",
  P012020 = "pop_male65_to_66",
  P012021 = "pop_male67_to_69",
  P012022 = "pop_male70_to_74",
  P012023 = "pop_male75_to_79",
  P012024 = "pop_male80_to_84",
  P012025 = "pop_male85plus",
  # females
  P012026 = "pop_fem",
  P012027 = "pop_fem0_to_5",
  P012028 = "pop_fem5_to_9",
  P012029 = "pop_fem10_to_14",
  P012030 = "pop_fem15_to_17",
  P012031 = "pop_fem18_to_19",
  P012032 = "pop_fem20",
  P012033 = "pop_fem21",
  P012034 = "pop_fem22_to_24",
  P012035 = "pop_fem25_to_29",
  P012036 = "pop_fem30_to_34",
  P012037 = "pop_fem35_to_39",
  P012038 = "pop_fem40_to_44"
)
# For every column of file2, find its position in the table (NA = not listed)
pos <- match(names(file2), names(age_sex_names))
names(file2)[!is.na(pos)] <- unname(age_sex_names[pos[!is.na(pos)]])
## AGE BY SEX (continued): remaining female age cells of P012.
# Codes absent from file2 are skipped silently, as the one-by-one renames did.
fem_age_names <- c(
  P012039 = "pop_fem45_to_49",
  P012040 = "pop_fem50_to_54",
  P012041 = "pop_fem55_to_59",
  P012042 = "pop_fem60_to_61",
  P012043 = "pop_fem62_to_64",
  P012044 = "pop_fem65_to_66",
  P012045 = "pop_fem67_to_69",
  P012046 = "pop_fem70_to_74",
  P012047 = "pop_fem75_to_79",
  P012048 = "pop_fem80_to_84",
  P012049 = "pop_fem85plus"
)
pos <- match(names(file2), names(fem_age_names))
names(file2)[!is.na(pos)] <- unname(fem_age_names[pos[!is.na(pos)]])

# for loop
# Looping over variable names: build the column names with paste0() and
# access them with [[ ]]. Original STATA code, for reference:
#   foreach x in male fem {
#     gen pop_`x'15_to_19 = pop_`x'15_to_17 + pop_`x'18_to_19
#     gen pop_`x'20_to_24 = pop_`x'20 + pop_`x'21 + pop_`x'22_to_24
#     gen pop_`x'60_to_64 = pop_`x'60_to_61 + pop_`x'62_to_64
#     gen pop_`x'65_to_69 = pop_`x'65_to_66 + pop_`x'67_to_69
#     drop pop_`x'15_to_17 pop_`x'18_to_19 pop_`x'20 pop_`x'21 pop_`x'22_to_24
#          pop_`x'60_to_61 pop_`x'62_to_64 pop_`x'65_to_66 pop_`x'67_to_69
#   }
# As in the earlier R version, the component columns are kept (not dropped).
age_merges <- list(
  "15_to_19" = c("15_to_17", "18_to_19"),
  "20_to_24" = c("20", "21", "22_to_24"),
  "60_to_64" = c("60_to_61", "62_to_64"),
  "65_to_69" = c("65_to_66", "67_to_69")
)
for (sex in c("male", "fem")) {
  for (target in names(age_merges)) {
    parts <- paste0("pop_", sex, age_merges[[target]])
    # file2[parts] stops with an error if a component column is missing
    file2[[paste0("pop_", sex, target)]] <- Reduce(`+`, file2[parts])
  }
}

## HOUSEHOLDS [1] ##
names(file2)[names(file2) == "P015001"] <- "hh_tot"
names(file2)[names(file2) == "P016001"] <- "pop_in_hh"
names(file2)[names(file2) == "P017001"] <- "avera_hh_size"

file2 <- file2[order(file2$logrecno), ]

#############################################################
# VERY IMPORTANT COMMAND: 'merge' ==> combines two or more datasets
#############################################################
dim(geo)
dim(file1)
# all = FALSE keeps only the records present in both data sets
census <- merge(geo, file1, by = "logrecno", all = FALSE)
dim(census)
names(census)

census <- merge(census, file2, by = "logrecno", all = FALSE)
names(census)

# The STATA command 'bys' creates variables by group; in R, aggregate the
# variable by group and merge the totals back ==> see examples below.
blkgr_keys <- c("state", "county", "tract", "blkgrp")

# Generate proportion of population by block group
res <- aggregate(census$pop_total, by = census[blkgr_keys], FUN = sum)
names(res) <- c(blkgr_keys, "pop_blkgr")
dim(res)
dim(census)
census <- merge(census, res, by = blkgr_keys, all = FALSE)
dim(census)
# Inspect the merged total by name rather than by column position
head(census$pop_blkgr, 10)

census$pop_per_blkgr <- census$pop_total / census$pop_blkgr
# to replace some values for others in a given variable, use indexing
# put zeros to the proportion for those blocks that are in block groups
# with zero population (0 / 0 would otherwise be NaN)
census$pop_per_blkgr[census$pop_blkgr == 0] <- 0

# Generate proportion of households by block group
res <- aggregate(census$hh_tot, by = census[blkgr_keys], FUN = sum)
names(res) <- c(blkgr_keys, "hh_blkgr")
dim(census)
census <- merge(census, res, by = blkgr_keys, all = FALSE)
dim(census)
head(census$hh_blkgr, 10)

census$hh_per_blkgr <- census$hh_tot / census$hh_blkgr
# put zeros to the proportion for those blocks that are in block groups
# with zero households
census$hh_per_blkgr[census$hh_blkgr == 0] <- 0

#############################################################
# SUMMARY FILE 3
#############################################################

#############################################################
# Open the fixed-column geo file ==> this is the only file with census
# unit ID variables
#############################################################
# Block-group geography for Summary File 3, read from the geo file
geo <- read.dta("./geofile_ca_orig.dta")
dim(geo)
names(geo)

# keep summary level 150 ==> get block group level dataset
geo <- geo[geo$sumlev == "150", ]
geo <- geo[geo$geocomp == "00", ]
dim(geo)

geo$logrecno <- as.numeric(geo$logrecno)

# To sort by one or more variables, use 'order'
geo <- geo[order(geo$logrecno), ]

# Open SF3 files
sf3.file2 <- read.dta("./sf3_file02_ca_orig.dta")
sf3.file3 <- read.dta("./sf3_file03_ca_orig.dta")
sf3.file4 <- read.dta("./sf3_file04_ca_orig.dta")
sf3.file6 <- read.dta("./sf3_file06_ca_orig.dta")
sf3.file7 <- read.dta("./sf3_file07_ca_orig.dta")
sf3.file8 <- read.dta("./sf3_file08_ca_orig.dta")

dim(geo)
# Merge geo with each SF3 file in turn on logrecno, keeping matches only
sf3 <- Reduce(
  function(left, right) merge(left, right, by = "logrecno", all = FALSE),
  list(geo, sf3.file2, sf3.file3, sf3.file4, sf3.file6, sf3.file7, sf3.file8)
)
dim(sf3)

# Sum the given cells of one SF3 table, e.g. sf3_total("P019", c(3, 25))
# is sf3$P019003 + sf3$P019025. Stops if any of the columns is missing.
sf3_total <- function(table, cells) {
  Reduce(`+`, sf3[sprintf("%s%03d", table, cells)])
}

## LANGUAGE SPOKEN AT HOME FOR POPULATION 5 YEARS AND OVER ##
## Universe: Population 5 years and over ##
sf3$blkgr_pop_5_speak_engl <- sf3_total("P019", c(3, 25, 47))
sf3$blkgr_pop_5_speak_span <- sf3_total("P019", c(4, 26, 48))
sf3$blkgr_pop_5_speak_indo <- sf3_total("P019", c(9, 31, 53))
sf3$blkgr_pop_5_speak_asianpac <- sf3_total("P019", c(14, 36, 58))
sf3$blkgr_pop_5_speak_other <- sf3_total("P019", c(19, 41, 63))

# Rename the P021 cells; codes absent from sf3 are skipped silently
p021_names <- c(
  P021001 = "blkgr_sf3_pop_tot",
  P021003 = "blkgr_pop_native_born_ca",
  P021004 = "blkgr_pop_native_born_usstate",
  P021009 = "blkgr_pop_native_born_outus",
  P021014 = "blkgr_pop_foreign_naturcit",
  P021015 = "blkgr_pop_foreign_nocit"
)
pos <- match(names(sf3), names(p021_names))
names(sf3)[!is.na(pos)] <- unname(p021_names[pos[!is.na(pos)]])

sf3$blkgr_pop_citizen <-
  sf3$P021002 + sf3$P021013 - sf3$blkgr_pop_foreign_nocit
# Same quantity built from its components, kept as a cross-check
sf3$blkgr_pop_citizen_test <-
  sf3$blkgr_pop_native_born_ca +
  sf3$blkgr_pop_native_born_usstate +
  sf3$blkgr_pop_native_born_outus +
  sf3$blkgr_pop_foreign_naturcit

## SEX BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 YEARS AND OVER [35] ##
## Universe: Population 25 years and over ##
# The original referred to these columns as sf3$037003 etc., which is not
# valid R and lacks the "P" prefix used by every other SF3 column here.
sf3$blkgr_pop_25_0to4grade <- sf3_total("P037", c(3, 4, 20, 21))
sf3$blkgr_pop_25_5to8grade <- sf3_total("P037", c(5, 6, 22, 23))
sf3$blkgr_pop_25_9to11grade <- sf3_total("P037", c(7:10, 24:27))
sf3$blkgr_pop_25_highschool <- sf3_total("P037", c(11, 12, 28, 29))
sf3$blkgr_pop_25_1colnode <- sf3_total("P037", c(13, 30))
sf3$blkgr_pop_25_bachelor <- sf3_total("P037", c(14, 15, 31, 32))
sf3$blkgr_pop_25_graduate <- sf3_total("P037", c(16:18, 33:35))

## Merge the census data file at the #block-group# level to the census file
## at the block level
dim(sf3)
dim(census)
# The original passed `by.y == c(...)` (a comparison) instead of an argument
census.final <- merge(
  census, sf3,
  by = c("state", "county", "tract", "blkgrp"),
  all = FALSE
)
dim(census.final)

# Apportion block-group totals to blocks. R version of the STATA loops:
#   foreach var in <list> { gen `var' = blkgr_`var' * pop_per_blkgr }
#   foreach var in <list> { gen `var' = blkgr_`var' * hh_per_blkgr }
# Some listed variables (labor force, household income, sf3_hh_tot) have no
# blkgr_ source built in this script yet; those are reported and skipped.

# Population
pop_vars <- c(
  "sf3_pop_tot",
  "pop_native_born_ca", "pop_native_born_usstate", "pop_native_born_outus",
  "pop_foreign_naturcit", "pop_foreign_nocit", "pop_citizen",
  "pop_5_speak_engl", "pop_5_speak_span", "pop_5_speak_indo",
  "pop_5_speak_asianpac", "pop_5_speak_other",
  "pop_25_0to4grade", "pop_25_5to8grade", "pop_25_9to11grade",
  "pop_25_highschool", "pop_25_1colnode", "pop_25_bachelor",
  "pop_25_graduate",
  "pop_16_nolabfor", "pop_16_inlabfor_all", "pop_16_inlabfor_arf",
  "pop_16_employed", "pop_16_unemployed"
)
for (v in pop_vars) {
  src <- paste0("blkgr_", v)
  if (src %in% names(census.final)) {
    census.final[[v]] <- census.final[[src]] * census.final$pop_per_blkgr
  } else {
    message("Skipping ", v, ": column ", src, " not found")
  }
}

# Households
hh_vars <- c(
  "sf3_hh_tot",
  "hh_income99_0to19", "hh_income99_20to39", "hh_income99_40to59",
  "hh_income99_60to74", "hh_income99_75to99", "hh_income99_100to199",
  "hh_income99_200"
)
for (v in hh_vars) {
  src <- paste0("blkgr_", v)
  if (src %in% names(census.final)) {
    census.final[[v]] <- census.final[[src]] * census.final$hh_per_blkgr
  } else {
    message("Skipping ", v, ": column ", src, " not found")
  }
}