Using IPUMS USA for Estimation of Population Characteristics in Various Geographic Areas

Spatial Analysis Survey data Census data

Using IPUMS USA data to produce survey-based estimates for geographic levels present in the IPUMS.

Corey S. Sparks, Ph.D. https://github.com/coreysparks
04-14-2021

In this example we will use the IPUMS USA data to produce survey-based estimates for various geographic levels present in the IPUMS. This example uses the 2014-2018 ACS 5-year microdata.

The good folks at IPUMS have created a library to read in their data from the .gz file that you download. Be sure you right click and save the DDI codebook when you create your extract

This will save the xml file that contains all the information on the data (what is contained in the data file) to your computer. When using IPUMS, it will have a name like usa_xxxxx.xml where the x’s represent the extract number (I’m on 92 as of today).

You will also need to download the data file, by right clicking the Download.DAT link in the above image. This will save a .gz file to your computer, again with a name like: usa_xxxxx.dat.gz. Make sure this file and the xml file from above are in the same folder, preferably your class folder.

Be sure the ipumsr package is installed.

library(ipumsr)
ddi <- read_ipums_ddi("~/OneDrive - University of Texas at San Antonio/classes//gis_classwork/usa_00083.xml")
data <- read_ipums_micro(ddi)
Use of data from IPUMS-USA is subject to conditions including that users should
cite the data appropriately. Use command `ipums_conditions()` for more details.
data<-haven::zap_labels(data) #necessary to avoid problems with "labeled" data

Load some other packages

Download geographic data for Public Use Microdata Areas

options(tigris_class = "sf")
pumas<-pumas(state = "TX", year = 2018, cb = T)
plot(pumas["GEOID10"], main = "Public Use Microdata Areas in Texas")

names(data)<-tolower(names(data))

Prepare variables

Here I recode several demographic variables

#weight variables, these have implied decimal places so we have to divde by 100, following the codebook
data$pwt <- data$perwt/100
data$hwt <- data$hhwt/100

#race/ethnicity
data$hisp <- Recode(data$hispan, recodes = "9=NA; 1:4='Hispanic'; 0='NonHispanic'")
data$race_rec <- Recode(data$race, recodes = "1='White'; 2='Black'; 3='Other'; 4:6='Asian'; 7:9='Other'")
data$race_eth <- interaction(data$hisp, data$race_rec, sep = "_")
data$race_eth  <- as.factor(ifelse(substr(as.character(data$race_eth),1,8) == "Hispanic", "Hispanic", as.character(data$race_eth)))
data$race_eth <- relevel(data$race_eth, ref = "NonHispanic_White")

#sex
data$male <- ifelse(data$sex == 1,1,0)

#education
data$educ_level<- Recode(data$educd, recodes = "2:61='0LT_HS';62:64='1_HSD/GED';65:80='2_somecoll';90:100='2_somecoll'; 81:83='3_AssocDegree';101='4_bachelordegree'; 110:116='4_BAplus_GradDegree'; else=NA")

#employment
data$employed <- Recode(data$wrklstwk, recodes = "1=0;2=1; else=NA")

#citizenship
data$cit<-Recode(data$citizen, recodes = "1='US born'; 2='naturalized'; 3:4='notcitizen';else=NA ")

#industry
data$ind_group<-Recode(data$ind, recodes = "170:490='ag_extract'; 770='construction'; 1070:3990='manufac'; 4070:5790='whole_retail'; 6070:6390='trans'; 6470:6780='information'; 6870:7190= 'fire'; 7270=7790='prof/sci/manage'; 7860:8470='edu/social'; 8560:8690='arts'; 8770:9290='other'; 9370:9590='public_adm'; 9670:9870='military'; else=NA ")

data$proftech <- Recode(data$ind, recodes = "7270:7490=1; 0=NA; else=0")

#age in 10 year intervals
data$agecat<-cut(data$age, breaks = c(0, 18, 20, 30, 40, 50, 65, 120), include.lowest = T)

Generate survey design object

Here we identify the person weights and the survey design variables.

des<-svydesign(ids = ~cluster,
               strata = ~ strata,
               weights = ~pwt,
               data = data)

perform survey estimation for PUMAs

The svyby() function allows us calculate estimates for different sub-domains within the data, this could be a demographic characteristic, but we’ll use our geographic level.

puma_est_employ<-svyby(formula = ~employed,
                       by = ~puma,
                       design=subset(des, age %in% 18:65),
                       FUN=svymean,
                       na.rm = TRUE )

puma_est_industry<-svyby(formula = ~proftech,
                         by = ~puma,
                         design = subset(des, employed==1),
                         FUN = svymean,
                         na.rm = TRUE )

head(puma_est_employ)
    puma  employed          se
100  100 0.7198977 0.007330523
200  200 0.7200376 0.009352539
300  300 0.7996959 0.008407888
400  400 0.6952739 0.008796408
501  501 0.7078317 0.012611244
502  502 0.7536922 0.008485558
head(puma_est_industry)
    puma   proftech          se
100  100 0.02736361 0.003251186
200  200 0.03023261 0.003945997
300  300 0.04936567 0.004513747
400  400 0.02275382 0.003263165
501  501 0.04444895 0.005380737
502  502 0.04217603 0.003739477

join to geography

pumas$puma<-as.numeric(pumas$PUMACE10)

geo1<-geo_join(pumas, puma_est_employ, by_sp="puma",by_df= "puma")
head(geo1)
Simple feature collection with 6 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -103.044 ymin: 26.15803 xmax: -94.53793 ymax: 36.5007
Geodetic CRS:  NAD83
  STATEFP10 PUMACE10       AFFGEOID10 GEOID10
1        48    02310 7950000US4802310 4802310
2        48    00100 7950000US4800100 4800100
3        48    06807 7950000US4806807 4806807
4        48    04607 7950000US4804607 4804607
5        48    01800 7950000US4801800 4801800
6        48    03900 7950000US4803900 4803900
                                                                            NAME10
1 Dallas (Far North), Carrollton (Southeast) Cities & Addison Town--North of I-635
2       Panhandle Regional Planning Commission (Outside Potter & Randall Counties)
3                                                    Hidalgo County (North & West)
4                      Houston City (Northeast)--Between Loop I-610 & Beltway TX-8
5                        East Texas COG (Southwest)--Henderson & Anderson Counties
6                                       Deep East Texas COG (West) & Walker County
  LSAD10     ALAND10  AWATER10 puma  employed          se rank
1     P0    62059007    384841 2310 0.8255404 0.007484850    1
2     P0 61987076347 283632891  100 0.7198977 0.007330523    1
3     P0  2563115029  18882358 6807 0.5847819 0.012739199    1
4     P0   162865050   1371540 4607 0.6494097 0.011932801    1
5     P0  5015260440 233022050 1800 0.6593252 0.011306024    1
6     P0 11228402879 399650925 3900 0.6385985 0.011127962    1
                        geometry
1 MULTIPOLYGON (((-96.9001 32...
2 MULTIPOLYGON (((-103.044 34...
3 MULTIPOLYGON (((-98.5832 26...
4 MULTIPOLYGON (((-95.33534 2...
5 MULTIPOLYGON (((-96.45414 3...
6 MULTIPOLYGON (((-95.86227 3...
geo2<-geo_join(pumas, puma_est_industry, by_sp="puma",by_df= "puma")
head(geo2)
Simple feature collection with 6 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -103.044 ymin: 26.15803 xmax: -94.53793 ymax: 36.5007
Geodetic CRS:  NAD83
  STATEFP10 PUMACE10       AFFGEOID10 GEOID10
1        48    02310 7950000US4802310 4802310
2        48    00100 7950000US4800100 4800100
3        48    06807 7950000US4806807 4806807
4        48    04607 7950000US4804607 4804607
5        48    01800 7950000US4801800 4801800
6        48    03900 7950000US4803900 4803900
                                                                            NAME10
1 Dallas (Far North), Carrollton (Southeast) Cities & Addison Town--North of I-635
2       Panhandle Regional Planning Commission (Outside Potter & Randall Counties)
3                                                    Hidalgo County (North & West)
4                      Houston City (Northeast)--Between Loop I-610 & Beltway TX-8
5                        East Texas COG (Southwest)--Henderson & Anderson Counties
6                                       Deep East Texas COG (West) & Walker County
  LSAD10     ALAND10  AWATER10 puma   proftech          se rank
1     P0    62059007    384841 2310 0.11304911 0.006613145    1
2     P0 61987076347 283632891  100 0.02736361 0.003251186    1
3     P0  2563115029  18882358 6807 0.02075907 0.005035484    1
4     P0   162865050   1371540 4607 0.05018302 0.008432400    1
5     P0  5015260440 233022050 1800 0.03392626 0.005050893    1
6     P0 11228402879 399650925 3900 0.03434501 0.005067912    1
                        geometry
1 MULTIPOLYGON (((-96.9001 32...
2 MULTIPOLYGON (((-103.044 34...
3 MULTIPOLYGON (((-98.5832 26...
4 MULTIPOLYGON (((-95.33534 2...
5 MULTIPOLYGON (((-96.45414 3...
6 MULTIPOLYGON (((-95.86227 3...

Map estimates

Employment rates by PUMA

tmap_mode("plot")

tm_basemap("OpenStreetMap.Mapnik")+
  tm_shape(geo1)+
  tm_polygons("employed",
              style="kmeans",
              n=8,
              legend.hist = TRUE) +
  tm_layout(legend.outside = TRUE,
            title = "Employment rate in Texas PUMAs \n 2014-2018") 

[1] "#FFFDDB" "#FEF1AF" "#FED97B" "#FEB441" "#F68820" "#DB5D0A"
[7] "#AF3E03" "#7A2A05"
tm_basemap("OpenStreetMap.Mapnik")+
  tm_shape(geo2)+
  tm_polygons("proftech",
              style="kmeans",
              n=8,
              legend.hist = TRUE) +
 tm_layout(legend.outside = TRUE,
            title = "Employment rate in Texas PUMAs \n 2014-2018")  

[1] "#FFFDDB" "#FEF1AF" "#FED97B" "#FEB441" "#F68820" "#DB5D0A"
[7] "#AF3E03" "#7A2A05"

Estimation for metro areas

Here we use core based statistical areas instead of PUMAs

mets<-core_based_statistical_areas(cb = T, year = 2018)
mets<-mets[grep(mets$NAME,pattern =  "TX"),]
plot(mets["NAME"])
sts<-states(cb=T, year=2018)
sts<-sts%>%
  filter(GEOID==48)

estimates by metro area

met_est_edu<-svyby(formula = ~educ_level,
                   by = ~met2013,
                   design=subset(des,age>25),
                   FUN=svymean,
                   na.rm=T )

met_est_employ<-svyby(formula = ~employed,
                      by = ~met2013,
                      design=subset(des, age%in%18:65),
                      FUN=svymean,
                      na.rm=T )

met_est_industry<-svyby(formula = ~proftech,
                        by = ~met2013,
                        design=subset(des, employed==1),
                        FUN=svymean,
                        na.rm=T )

head(met_est_edu)
      met2013 educ_level0LT_HS educ_level1_HSD/GED
0           0        0.1734865           0.3199473
11100   11100        0.1474012           0.2498404
12420   12420        0.1044039           0.1943825
13140   13140        0.1402338           0.3429575
15180   15180        0.3342822           0.2588369
17780   17780        0.1318924           0.1901474
      educ_level2_somecoll educ_level3_AssocDegree
0                0.2411852              0.07472963
11100            0.2600133              0.08979227
12420            0.2014018              0.06550880
13140            0.2479652              0.08337222
15180            0.1654062              0.06845496
17780            0.1864515              0.07192515
      educ_level4_bachelordegree educ_level4_BAplus_GradDegree
0                      0.1304826                    0.06016876
11100                  0.1753535                    0.07759931
12420                  0.2778240                    0.15647902
13140                  0.1287706                    0.05670054
15180                  0.1198257                    0.05319409
17780                  0.2208187                    0.19876491
      se.educ_level0LT_HS se.educ_level1_HSD/GED
0             0.001471122            0.001654932
11100         0.005841750            0.006460409
12420         0.002091061            0.002424993
13140         0.004418930            0.005939102
15180         0.006285971            0.005498910
17780         0.007826039            0.007652477
      se.educ_level2_somecoll se.educ_level3_AssocDegree
0                 0.001464641               0.0008796497
11100             0.006192460               0.0039423728
12420             0.002213295               0.0013175601
13140             0.005216824               0.0034449899
15180             0.004549337               0.0032076764
17780             0.007261211               0.0048312880
      se.educ_level4_bachelordegree se.educ_level4_BAplus_GradDegree
0                       0.001143757                     0.0007838818
11100                   0.005399918                     0.0035263553
12420                   0.002428862                     0.0018713949
13140                   0.004003550                     0.0026631508
15180                   0.004242967                     0.0025605276
17780                   0.007644511                     0.0073349589
head(met_est_employ)
      met2013  employed          se
0           0 0.6822992 0.001817833
11100   11100 0.7632274 0.006290738
12420   12420 0.7672150 0.002455673
13140   13140 0.6694852 0.006105194
15180   15180 0.6316548 0.006056171
17780   17780 0.6932124 0.008811923
head(met_est_industry)
      met2013   proftech           se
0           0 0.03182279 0.0007590619
11100   11100 0.04120088 0.0030842756
12420   12420 0.11986775 0.0019675186
13140   13140 0.03952930 0.0028814578
15180   15180 0.02552637 0.0023588194
17780   17780 0.06566332 0.0045709503
mets$met2013<-as.numeric(mets$GEOID)
geo3<-geo_join(mets, met_est_employ,by_sp= "met2013",by_df= "met2013")

Note, grey Metros are ones that are not identified in the ACS

tm_basemap("OpenStreetMap.Mapnik")+
  tm_shape(geo3)+
  tm_polygons("employed",
              style="kmeans",
              n=8,
              legend.hist = TRUE) +
 tm_layout(legend.outside = TRUE,
            title = "Employment rate in Texas Metro Areas \n 2014-2018")  

[1] "#FFFDDB" "#FEF1AF" "#FED97B" "#FEB441" "#F68820" "#DB5D0A"
[7] "#AF3E03" "#7A2A05"

Estimation for Counties

cos<-counties(cb= T,state = "TX", year = 2018)
plot(cos["NAME"])
sts<-states(cb=T, year=2018)
sts<-sts%>%
  filter(GEOID==48)

estimates by county area

cos_est_edu<-svyby(formula = ~educ_level,
                   by = ~countyfip,
                   design=subset(des,age>25),
                   FUN=svymean, na.rm=T )
cos_est_employ<-svyby(formula = ~employed,
                      by = ~countyfip,
                      design=subset(des, age%in%18:65),
                      FUN=svymean, na.rm=T )
cos_est_industry<-svyby(formula = ~proftech,
                        by = ~countyfip,
                        design=subset(des, employed==1),
                        FUN=svymean, na.rm=T )

head(cos_est_edu)
   countyfip educ_level0LT_HS educ_level1_HSD/GED
0          0       0.17846280           0.3276104
27        27       0.09174597           0.2593956
29        29       0.16342280           0.2540599
39        39       0.11680599           0.2405961
41        41       0.13189240           0.1901474
61        61       0.33428223           0.2588369
   educ_level2_somecoll educ_level3_AssocDegree
0             0.2367892              0.07129873
27            0.2905451              0.11235938
29            0.2272522              0.07977870
39            0.2494250              0.08209923
41            0.1864515              0.07192515
61            0.1654062              0.06845496
   educ_level4_bachelordegree educ_level4_BAplus_GradDegree
0                   0.1275334                    0.05830544
27                  0.1583623                    0.08759169
29                  0.1741433                    0.10134311
39                  0.1948811                    0.11619262
41                  0.2208187                    0.19876491
61                  0.1198257                    0.05319409
   se.educ_level0LT_HS se.educ_level1_HSD/GED se.educ_level2_somecoll
0          0.001467295            0.001631614             0.001419003
27         0.004197955            0.006089337             0.006455573
29         0.002328835            0.002664094             0.002362733
39         0.005448384            0.006403249             0.006913765
41         0.007826039            0.007652477             0.007261211
61         0.006285971            0.005498910             0.004549337
   se.educ_level3_AssocDegree se.educ_level4_bachelordegree
0                0.0008436292                   0.001112816
27               0.0042730538                   0.004847906
29               0.0015251664                   0.002065234
39               0.0040208220                   0.006332340
41               0.0048312880                   0.007644511
61               0.0032076764                   0.004242967
   se.educ_level4_BAplus_GradDegree
0                      0.0007589417
27                     0.0036663732
29                     0.0015631539
39                     0.0056966081
41                     0.0073349589
61                     0.0025605276
head(cos_est_employ)
   countyfip  employed          se
0          0 0.6767834 0.001793685
27        27 0.7022383 0.006425394
29        29 0.7115368 0.002715471
39        39 0.7374041 0.006602161
41        41 0.6932124 0.008811923
61        61 0.6316548 0.006056171
head(cos_est_industry)
   countyfip   proftech           se
0          0 0.03365621 0.0007609425
27        27 0.03673016 0.0028576691
29        29 0.06187635 0.0015303390
39        39 0.05968169 0.0041933537
41        41 0.06566332 0.0045709503
61        61 0.02552637 0.0023588194

Again, the ACS doesn’t identify counties in the microdata except for those counties with small populations. The list of identified counties can be found here

cos$cofip<-as.numeric(cos$COUNTYFP)


geo4<-geo_join(cos, cos_est_employ,by_sp= "cofip",by_df= "countyfip")



tm_basemap("OpenStreetMap.Mapnik")+
  tm_shape(geo4)+
  tm_polygons("employed",
              style="kmeans",
              n=8,
              legend.hist = TRUE) +
 tm_layout(legend.outside = TRUE,
            title = "Employment rate in Texas Counties \n 2014-2018")  

[1] "#FFFDDB" "#FEF1AF" "#FED97B" "#FEB441" "#F68820" "#DB5D0A"
[7] "#AF3E03" "#7A2A05"

Citation

For attribution, please cite this work as

Ph.D. (2021, April 14). Corey Sparks's Blog: Using IPUMS USA for Estimation of Population Characteristics in Various Geographic Areas. Retrieved from https://coreysparks.github.io/posts/2021-04-14-mapping-ipums-estimates/

BibTeX citation

@misc{ph.d.2021using,
  author = {Ph.D., Corey S. Sparks,},
  title = {Corey Sparks's Blog: Using IPUMS USA for Estimation of Population Characteristics in Various Geographic Areas},
  url = {https://coreysparks.github.io/posts/2021-04-14-mapping-ipums-estimates/},
  year = {2021}
}