Using ipumsr to load immigrant statistics

Author

Daniel Perez and Emma Cohn

The following chunk of code loads the R libraries necessary for this exercise. You may need to install them to run this code.

#Load necessary libraries
library(tidyverse)
library(ipumsr)
library(janitor)
library(labelled)
library(fs)

Create and clean an extract using ipumsr.

You must set up an IPUMS API key before using the ipumsr package. For instructions on how to set up the IPUMS API, see “Introduction to the IPUMS API for R Users.”

For guidance on how to define an extract, see “Microdata API Requests.” You can view the list of available IPUMS ACS samples and their IDs on the IPUMS sample IDs page.

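Note: You only need to define and submit the extract once, unless your extract parameters (e.g., years or variables) change. define_extract_micro() only builds the request locally; it is submit_extract() that sends the request to the IPUMS API, and wait_for_extract() then waits until the extract is ready, which may take several minutes depending on the extract size. If running this as part of a larger script, comment out these commands (and the download step) after the first run to avoid repeated requests and downloads.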

# Look up the samples available in the IPUMS USA collection (handy for
# checking the sample IDs requested below)
acs_samps <- ipumsr::get_sample_info("usa")

# Build the vector of sample IDs to request: one "us<YEAR>a" ID per year.
# To change the years covered, edit the range below.
# NOTE(review): confirm each generated ID against `acs_samps` (or the IPUMS
# sample IDs page) to be sure it is the sample you intend for that year.
years <- paste0("us", 2000:2023, "a")

# Define the extract, submit it to the IPUMS API, and wait until it is ready.
# `acs_extr` holds the extract object returned by wait_for_extract().
acs_extr <- define_extract_micro(
  collection = "usa",
  description = "ACS extract for Immigration statistics",
  samples = years,
  # Select the variables to load
  variables = c(
    "STATEFIP", "COUNTYFIP", "SEX", "AGE", "RACE", "HISPAN",
    "BPL", "CITIZEN", "YRNATUR", "YRIMMIG", "YRSUSA1",
    "LANGUAGE", "EMPSTAT", "LABFORCE", "OCC", "IND"
  )
) |>
  submit_extract() |>
  wait_for_extract()

# Make sure the input folder exists before downloading into it
# (dir_create() does nothing if the folder is already there)
fs::dir_create("input")

# Download extract to input folder, replacing any earlier copy
dl_extr <- download_extract(
  extract = acs_extr,
  download_dir = "input/",
  overwrite = TRUE
)

Load the extract (the xml file) and clean it up before conducting analysis.

Note: Your extract will likely have a different file name. Double-check it and update the script accordingly before running the following chunk.

# NOTE: Your project directory and xml file may look different!
# Read the extract through its DDI (.xml) file
acs_raw <- read_ipums_micro(ddi = "input/usa_00010.xml")

# Custom value labels for citizen.
# citizen = 0 is relabelled "Not foreign born" per
# https://usa.ipums.org/usa-action/variables/CITIZEN#comparability_section
citizen_labels <- c(
  "Not foreign born" = 0,
  "Born abroad of American parents" = 1,
  "Naturalized citizen" = 2,
  "Not a citizen" = 3
)

# Value labels for the derived nativity variable
nativity_labels <- c("Native" = 1, "Foreign-born" = 2)

acs <- acs_raw |>
  # janitor tidies the column names
  janitor::clean_names() |>
  # labelled swaps in the custom citizen labels
  labelled::set_value_labels(citizen = citizen_labels) |>
  # Collapse citizen into nativity: codes 0/1 -> 1, codes 2/3 -> 2
  # (any other code falls through to NA)
  mutate(
    nativity = case_when(
      citizen %in% c(0, 1) ~ 1,
      citizen %in% c(2, 3) ~ 2
    )
  ) |>
  labelled::add_value_labels(nativity = nativity_labels)

Benchmark your data

Run a US population benchmark using the Census ACS table statistics to check your data before continuing.

# Benchmark check: do your US population estimates line up with the
# Census ACS table statistics?
#   https://data.census.gov/table/ACSDP1Y2023.DP05?q=DP05:+ACS+Demographic+and+Housing+Estimates

# Sum the person weights (perwt) within each year
us_pop <- summarise(
  acs,
  pop = sum(perwt, na.rm = TRUE),
  .by = year
)

us_pop

# A tibble: 24 × 2
    year       pop
   <int>     <dbl>
 1  2000 281421906
 2  2001 277075792
 3  2002 280717370
 4  2003 283051379
 5  2004 285674993
 6  2005 288398819
 7  2006 299398485
 8  2007 301621159
 9  2008 304059728
10  2009 307006556
# ℹ 14 more rows

Run your analysis!

Here you will run your analysis to find three statistics:

Population by citizenship status and year
Population count and share by nativity and year
Employment counts and shares of immigrant workers by industry (See a list of industry codes and their associated titles here)

Don’t forget to update the code to match your selection of years.

This code can also be easily altered to filter for specific groups. For example, you can filter by state, specific industry, or for prime-age workers. See the commented-out commands for examples. Be sure to check for viable sample sizes when using a smaller data set.

# Population by citizenship status and year 2000–2023

# Step 1: weighted population for every year x citizenship combination
citizen_pop_long <- acs |>
  ## To restrict to North Carolina, uncomment the next line
  # filter(statefip == 37) |>
  mutate(citizen = labelled::to_factor(citizen)) |>
  summarise(
    pop = sum(perwt, na.rm = TRUE),
    .by = c(year, citizen)
  )

# Step 2: one row per year, one column per citizenship status
# (year is the only remaining column, so it is the row identifier)
foreign_born_total <- pivot_wider(
  citizen_pop_long,
  names_from = citizen,
  values_from = pop
)

foreign_born_total

# A tibble: 24 × 5
    year `Not foreign born` `Not a citizen` `Naturalized citizen`
   <int>              <dbl>           <dbl>                 <dbl>
 1  2000          248371297        18599549              12533932
 2  2001          243578016        18804708              12743420
 3  2002          245639288        19565399              13530751
 4  2003          247351604        19749916              13917762
 5  2004          249429239        19857656              14400045
 6  2005          250569155        20836032              14933571
 7  2006          259759423        21696303              15773084
 8  2007          261446898        21843559              16204897
 9  2008          263619759        21685745              16330357
10  2009          266132797        21640993              16811829
# ℹ 14 more rows
# ℹ 1 more variable: `Born abroad of American parents` <dbl>

# Population by nativity and year 2000–2023
nativity <- acs |>
  # Replace the numeric nativity codes with their labels
  mutate(nativity = labelled::to_factor(nativity)) |>
  # Weighted population for each year x nativity group
  summarise(
    pop = sum(perwt, na.rm = TRUE),
    .by = c(year, nativity)
  ) |>
  # Each group's share of that year's total
  mutate(
    share = pop / sum(pop),
    .by = year
  ) |>
  # One row per year; pop_* and share_* columns per nativity group
  pivot_wider(
    names_from = nativity,
    values_from = c(pop, share)
  )

nativity

# A tibble: 24 × 5
    year pop_Native `pop_Foreign-born` share_Native `share_Foreign-born`
   <int>      <dbl>              <dbl>        <dbl>                <dbl>
 1  2000  250288425           31133481        0.889                0.111
 2  2001  245527664           31548128        0.886                0.114
 3  2002  247621220           33096150        0.882                0.118
 4  2003  249383701           33667678        0.881                0.119
 5  2004  251417292           34257701        0.880                0.120
 6  2005  252629216           35769603        0.876                0.124
 7  2006  261929098           37469387        0.875                0.125
 8  2007  263572703           38048456        0.874                0.126
 9  2008  266043626           38016102        0.875                0.125
10  2009  268553734           38452822        0.875                0.125
# ℹ 14 more rows

# Industries of immigrant workers
# This analysis pools several years of data. Each person weight is divided by
# the number of pooled years, so totals are an average across those years.
# Change `pool_years` to pool a different set of years; the divisor follows.
# NOTE(review): confirm the industry (ind) codes are comparable across all
# pooled years before interpreting the results.
pool_years <- 2019:2023
n_pool_years <- length(pool_years)

nativity_ind <- acs |>
  # Keep the pooled years, ages 16+, and empstat == 1 (counted as employment
  # in total_emp below)
  filter(year %in% pool_years, age >= 16, empstat == 1) |>
  ## filter for prime-age EPOP
  # filter(age >= 25 & age <= 54) |>
  mutate(nativity = to_factor(nativity)) |>
  summarize(
    # Adjust perwt, dividing it by the number of pooled years
    total_emp = sum(perwt / n_pool_years, na.rm = TRUE),
    # Unweighted number of records, for checking sample sizes
    n = n(),
    .by = c(nativity, ind)
  ) |>
  # Each industry's share of employment within its nativity group
  mutate(share = total_emp / sum(total_emp), .by = nativity) |>
  pivot_wider(
    id_cols = ind,
    names_from = nativity,
    values_from = c(total_emp, share, n)
  )

nativity_ind

# A tibble: 302 × 7
   ind       total_emp_Native `total_emp_Foreign-born` share_Native
   <int+lbl>            <dbl>                    <dbl>        <dbl>
 1 9160              1018174.                  128856.     0.00767 
 2 5170               682919                   139648.     0.00514 
 3 7870              3681723.                  803696.     0.0277  
 4 5570               100653                    15650.     0.000758
 5 6480               135431                    17229.     0.00102 
 6 3291               667545                   118284      0.00503 
 7 8680              7329304.                 1796732.     0.0552  
 8 9670               401908                    34028.     0.00303 
 9 5391              1614827.                  217887.     0.0122  
10 5490               176175                    27847      0.00133 
# ℹ 292 more rows
# ℹ 3 more variables: `share_Foreign-born` <dbl>, n_Native <int>,
#   `n_Foreign-born` <int>

# See a list of industry codes and their associated titles here: https://usa.ipums.org/usa/volii/ind2022.shtml

Happy coding!