#Load necessary libraries
library(tidyverse)
library(ipumsr)
library(janitor)
library(labelled)
library(fs)
Using ipumsr to load immigrant statistics
The library() calls at the top of this file load the R packages necessary for this exercise. You may need to install them before running this code.
Create and clean an extract using IPUMSR.
You must set up an IPUMS API key before using the ipumsr package. For instructions on how to set up the IPUMS API, see “Introduction to the IPUMS API for R Users.”
For guidance on how to define an extract, see “Microdata API Requests.” You can view the list of available IPUMS ACS samples and their IDs on the IPUMS sample IDs page.
Note: You only need to run the define_extract_micro() pipeline below once, unless your extract parameters (e.g., years or variables) change. Submitting the extract triggers an IPUMS API call, which may take several minutes depending on the extract size. If running this as part of a larger script, comment out these commands to avoid repeated downloads.
# Load the table of available IPUMS USA samples so that sample IDs can be
# looked up before building the extract.
acs_samps <- ipumsr::get_sample_info('usa')
# Create a vector of sample IDs to load (one per year, 2000 through 2023).
# Update this vector if you want a different selection of years.
years <- c(
  'us2000a', 'us2001a', 'us2002a', 'us2003a', 'us2004a', 'us2005a',
  'us2006a', 'us2007a', 'us2008a', 'us2009a', 'us2010a', 'us2011a',
  'us2012a', 'us2013a', 'us2014a', 'us2015a', 'us2016a', 'us2017a',
  'us2018a', 'us2019a', 'us2020a', 'us2021a', 'us2022a', 'us2023a'
)
# Define the extract, submit it to the IPUMS API, and wait until it is ready.
# `acs_extr` holds the extract object returned by wait_for_extract(), which
# is what download_extract() needs.
acs_extr <- define_extract_micro(
  "usa",
  description = 'ACS extract for Immigration statistics',
  samples = years,
  # Select the variables to load
  variables = list(
    'STATEFIP', 'COUNTYFIP', 'SEX', 'AGE', 'RACE', 'HISPAN',
    'BPL', 'CITIZEN', 'YRNATUR', 'YRIMMIG', 'YRSUSA1',
    'LANGUAGE', 'EMPSTAT', 'LABFORCE', 'OCC', 'IND'
  )
) |>
  submit_extract() |>
  wait_for_extract()
# Download extract to input folder.
# Make sure the target directory exists first; dir_create() does nothing if
# it is already there.
fs::dir_create('input/')
dl_extr <- download_extract(
  extract = acs_extr,
  download_dir = 'input/',
  overwrite = TRUE
)
Load the extract (the xml file) and clean it up before conducting analysis.
Note: Your extract will likely have a different file name. Double-check it and update the script accordingly before running the following chunk.
# Read the downloaded extract via its DDI codebook (the .xml file).
# NOTE: Your project directory and xml file may look different!
acs_raw <- read_ipums_micro(ddi = 'input/usa_00010.xml')
# Build the analysis data set: clean the column names, relabel `citizen`,
# and derive a two-category `nativity` variable from it.
acs <- acs_raw |>
  # Use the janitor library to clean up names
  janitor::clean_names() |>
  # Use labelled library to create custom value labels
  # relabel citizen=0 to "Not foreign born" per https://usa.ipums.org/usa-action/variables/CITIZEN#comparability_section
  labelled::set_value_labels(
    citizen = c(
      'Not foreign born' = 0,
      'Born abroad of American parents' = 1,
      'Naturalized citizen' = 2,
      'Not a citizen' = 3
    )
  ) |>
  # nativity: 1 = citizen codes 0-1, 2 = citizen codes 2-3.
  # Any other citizen code matches neither condition and becomes NA.
  mutate(nativity = case_when(
    citizen %in% c(0, 1) ~ 1,
    citizen %in% c(2, 3) ~ 2
  )) |>
  add_value_labels(nativity = c('Native' = 1, 'Foreign-born' = 2))
Benchmark your data
Run a US population benchmark using the Census ACS table statistics to check your data before continuing.
# Do your US population estimates match the Census ACS table statistics?
# https://data.census.gov/table/ACSDP1Y2023.DP05?q=DP05:+ACS+Demographic+and+Housing+Estimates
# Sum the person weights within each year to get a population estimate.
us_pop <- acs |>
  summarize(
    pop = sum(perwt, na.rm = TRUE),
    .by = year
  )
us_pop
# A tibble: 24 × 2
year pop
<int> <dbl>
1 2000 281421906
2 2001 277075792
3 2002 280717370
4 2003 283051379
5 2004 285674993
6 2005 288398819
7 2006 299398485
8 2007 301621159
9 2008 304059728
10 2009 307006556
# ℹ 14 more rows
Run your analysis!
Here you will run your analysis to find three statistics:
- Population by citizenship status and year
- Population count and share by nativity and year
- Employment counts and shares of immigrant workers by industry (see the list of industry codes and their associated titles at https://usa.ipums.org/usa/volii/ind2022.shtml)
Don’t forget to update the code to match your selection of years.
This code can also be easily altered to filter for specific groups. For example, you can filter by state, specific industry, or for prime-age workers. See the commented-out commands for examples. Be sure to check for viable sample sizes when using a smaller data set.
# Population by citizenship status and year 2000–2023
# Produces one row per year and one column per citizenship label, holding
# the weighted population count.
foreign_born_total <- acs |>
  ## filter to just North Carolina
  # filter(statefip == 37) |>
  # Convert the labelled codes to a factor so the labels become column names
  mutate(citizen = to_factor(citizen)) |>
  summarize(
    pop = sum(perwt, na.rm = TRUE),
    .by = c(year, citizen)
  ) |>
  pivot_wider(id_cols = year, names_from = citizen, values_from = pop)
foreign_born_total
# A tibble: 24 × 5
year `Not foreign born` `Not a citizen` `Naturalized citizen`
<int> <dbl> <dbl> <dbl>
1 2000 248371297 18599549 12533932
2 2001 243578016 18804708 12743420
3 2002 245639288 19565399 13530751
4 2003 247351604 19749916 13917762
5 2004 249429239 19857656 14400045
6 2005 250569155 20836032 14933571
7 2006 259759423 21696303 15773084
8 2007 261446898 21843559 16204897
9 2008 263619759 21685745 16330357
10 2009 266132797 21640993 16811829
# ℹ 14 more rows
# ℹ 1 more variable: `Born abroad of American parents` <dbl>
# Population by nativity and year 2000–2023
# Produces one row per year with the weighted population count (pop_*) and
# the within-year share (share_*) for each nativity label.
nativity <- acs |>
  mutate(nativity = to_factor(nativity)) |>
  summarize(
    pop = sum(perwt, na.rm = TRUE),
    .by = c(year, nativity)
  ) |>
  # Share of each nativity group within its year
  mutate(share = pop / sum(pop), .by = year) |>
  pivot_wider(
    id_cols = year,
    names_from = nativity,
    values_from = c(pop, share)
  )
nativity
# A tibble: 24 × 5
year pop_Native `pop_Foreign-born` share_Native `share_Foreign-born`
<int> <dbl> <dbl> <dbl> <dbl>
1 2000 250288425 31133481 0.889 0.111
2 2001 245527664 31548128 0.886 0.114
3 2002 247621220 33096150 0.882 0.118
4 2003 249383701 33667678 0.881 0.119
5 2004 251417292 34257701 0.880 0.120
6 2005 252629216 35769603 0.876 0.124
7 2006 261929098 37469387 0.875 0.125
8 2007 263572703 38048456 0.874 0.126
9 2008 266043626 38016102 0.875 0.125
10 2009 268553734 38452822 0.875 0.125
# ℹ 14 more rows
# Employment of immigrant workers by industry
# This analysis pools 5 years of data. Change `pooled_years` to pool a
# different set of years; the weight adjustment below follows automatically.
pooled_years <- 2019:2023

nativity_ind <- acs |>
  # Keep rows aged 16+ with empstat == 1 in the pooled years
  filter(year %in% pooled_years, age >= 16, empstat == 1) |>
  ## filter for prime-age EPOP
  # filter(age >= 25 & age <= 54) |>
  mutate(nativity = to_factor(nativity)) |>
  # Adjust perwt, dividing it by the number of pooled years so the total is
  # an annual average. Every remaining row has empstat == 1, so the weight
  # is summed directly.
  summarize(
    total_emp = sum(perwt / length(pooled_years), na.rm = TRUE),
    # Unweighted row count, for checking sample sizes
    n = n(),
    .by = c(nativity, ind)
  ) |>
  # Each industry's share of total employment within its nativity group
  mutate(share = total_emp / sum(total_emp), .by = nativity) |>
  pivot_wider(
    id_cols = ind,
    names_from = nativity,
    values_from = c(total_emp, share, n)
  )
nativity_ind
# A tibble: 302 × 7
ind total_emp_Native `total_emp_Foreign-born` share_Native
<int+lbl> <dbl> <dbl> <dbl>
1 9160 1018174. 128856. 0.00767
2 5170 682919 139648. 0.00514
3 7870 3681723. 803696. 0.0277
4 5570 100653 15650. 0.000758
5 6480 135431 17229. 0.00102
6 3291 667545 118284 0.00503
7 8680 7329304. 1796732. 0.0552
8 9670 401908 34028. 0.00303
9 5391 1614827. 217887. 0.0122
10 5490 176175 27847 0.00133
# ℹ 292 more rows
# ℹ 3 more variables: `share_Foreign-born` <dbl>, n_Native <int>,
# `n_Foreign-born` <int>
# See a list of industry codes and their associated titles here: https://usa.ipums.org/usa/volii/ind2022.shtml
Happy coding!