eiCompare: Geocoding

In this vignette, we will walk through how to geocode a dataset of addresses so that the BISG method can be applied to estimate the race/ethnicity of registered voters.

What is Geocoding?

One of the first steps in performing ecological inference with eiCompare is geocoding your voter file addresses so that you can perform Bayesian Improved Surname Geocoding (BISG). Geocoding is the process of using an address or place name to find the geographic coordinates (i.e. latitude/longitude) of that location on a map. For BISG, the geographic coordinates are matched to census data containing self-reported race and ethnicity to determine the likelihood that an individual living in an ecological unit (i.e. county, block, tract) is of a certain race given their address. This probability is then used to update a prior probability in the BISG analysis. For more information on BISG, please refer to the BISG vignette. Below are some steps to walk you through the process of geocoding your voter file.

Step 1: Load R libraries/packages

Each library/package loaded allows you to use certain functions needed to prep your data for geocoding and run the geocoding tool(s).

library(eiCompare)
library(stringr)
library(plyr)
library(dplyr)
library(tidyr)
library(foreach)
library(parallel)
library(doParallel)
library(data.table)
library(censusxy)
library(sf)
library(leaflet)

Step 2: Load your voter data.

We are using a toy dataset representing voter registrations in Fulton and Gwinnett counties, Georgia, and we will geocode all voter addresses.

# Build a 20-row toy voter file: rows 1-10 are Fulton County voters and
# rows 11-20 are Gwinnett County voters (both in Georgia).
county_code <- rep(c(60, 67), each = 10)
county_name <- rep(c("Fulton", "Gwinnett"), each = 10)
registration_number <- seq_len(20)
voter_status <- rep("A", times = 20)

# Voter names, listed in registration order (Fulton first, then Gwinnett)
last_name <- c(
  "LOCKLER", "RADLEY", "BOORSE", "DEL RAY", "MUSHARBASH",
  "HILLEBRANDT", "HELME", "GILBRAITH", "RUKA", "JUBINVILLE",
  "HE", "MAZ", "GAULE", "BOETTICHER", "MCMELLEN",
  "RIDEOUT", "WASHINGTON", "KULENOVIC", "HERNANDEZ", "LONG"
)
first_name <- c(
  "GABRIELLA", "OLIVIA", "KEISHA", "ALEX", "NADIA",
  "LAILA", "ELSON", "JOY", "MATTHEW", "KENNEDY",
  "JOSE", "SAVANNAH", "NATASHIA", "SEAN", "ISMAEL",
  "LUQMAN", "BRYN", "EVELYN", "SAMANTHA", "BESSIE"
)

# Address components. House numbers and ZIP codes are kept as character
# strings rather than numbers.
str_num <- c(
  "1084", "7305", "6200", "6500", "1073",
  "100", "125", "6425", "6850", "900",
  "287", "1359", "2961", "1525", "4305",
  "3530", "1405", "4115", "3465", "3655"
)
str_name <- c(
  "Howell Mill Rd NW", "Village Center Blvd", "Bakers Ferry Rd SW",
  "Aria Blvd", "Jameson Pass", "Hollywood Rd NW", "Autumn Ridge Trl",
  "Hammond Dr NE", "Oakley Rd", "Peachtree Dunwoody Rd",
  "E Crogan St", "Beaver Ruin Road", "Lenora Church Rd",
  "Station Center Blvd", "Paxton Ln", "Parkwood Hills Ct",
  "Beaver Ruin Rd", "S Lee St", "Duluth Highway 120",
  "Peachtree Industrial Blvd"
)

# Only voters 1, 3, 6 and 8 carry a directional suffix; all others are NA.
# NOTE(review): for those four voters str_name already ends in the same
# directional (e.g. "Howell Mill Rd NW" with suffix "NW") -- confirm that
# concatenating name and suffix does not duplicate it in the final address.
str_suffix <- rep(NA_character_, 20)
str_suffix[c(1, 3, 6, 8)] <- c("NW", "SW", "NW", "NE")

city <- c(
  "Atlanta", "Fairburn", "Atlanta", "Sandy Springs", "Atlanta",
  "Roswell", "Sandy Springs", "Union City", "Sandy Springs", "Alpharetta",
  "Lawrenceville", "Norcross", "Snellville", "Suwanee", "Lilburn",
  "Snellville", "Norcross", "Buford", "Duluth", "Duluth"
)

# A single value: data.frame() recycles it across all 20 rows
state <- "GA"

zipcode <- c(
  "30318", "30213", "30331", "30328", "30318",
  "30076", "30328", "30291", "30328", "30022",
  "30045", "30093", "30078", "30024", "30047",
  "30078", "30093", "30518", "30096", "30096"
)

# Assemble the voter file; column order matches the order listed here
voter_data <- data.frame(
  county_code = county_code,
  county_name = county_name,
  registration_number = registration_number,
  voter_status = voter_status,
  last_name = last_name,
  first_name = first_name,
  str_num = str_num,
  str_name = str_name,
  str_suffix = str_suffix,
  city = city,
  state = state,
  zipcode = zipcode
)

Check the voter registration file, voter_data, to make sure the dataset was created properly.

# Preview the first six voters
head(voter_data, n = 6)
#>   county_code county_name registration_number voter_status   last_name
#> 1          60      Fulton                   1            A     LOCKLER
#> 2          60      Fulton                   2            A      RADLEY
#> 3          60      Fulton                   3            A      BOORSE
#> 4          60      Fulton                   4            A     DEL RAY
#> 5          60      Fulton                   5            A  MUSHARBASH
#> 6          60      Fulton                   6            A HILLEBRANDT
#>   first_name str_num            str_name str_suffix          city state zipcode
#> 1  GABRIELLA    1084   Howell Mill Rd NW         NW       Atlanta    GA   30318
#> 2     OLIVIA    7305 Village Center Blvd       <NA>      Fairburn    GA   30213
#> 3     KEISHA    6200  Bakers Ferry Rd SW         SW       Atlanta    GA   30331
#> 4       ALEX    6500           Aria Blvd       <NA> Sandy Springs    GA   30328
#> 5      NADIA    1073        Jameson Pass       <NA>       Atlanta    GA   30318
#> 6      LAILA     100     Hollywood Rd NW         NW       Roswell    GA   30076

# List the columns available in the voter file
colnames(voter_data)
#>  [1] "county_code"         "county_name"         "registration_number"
#>  [4] "voter_status"        "last_name"           "first_name"         
#>  [7] "str_num"             "str_name"            "str_suffix"         
#> [10] "city"                "state"               "zipcode"

# Report the number of rows and columns in the voter file
c(nrow(voter_data), ncol(voter_data))

Prepare/Structure your voter data for geocoding.

# Combine the house number, street name and directional suffix columns into
# one street address per voter. The next step reads the result from a column
# named "street_address".
voter_data <- concat_streetname(
  street_suffix = "str_suffix",
  street_name = "str_name",
  street_number = "str_num",
  voter_file = voter_data
)

Create a column for the final address in the voter file.

# Create the final address from the street address, city, state and ZIP code
# columns. This address can also be used if you are geocoding with the
# Opencage API.
voter_data <- concat_final_address(
  zipcode = "zipcode",
  state = "state",
  city = "city",
  street_address = "street_address",
  voter_file = voter_data
)

# Switch the voter file from a data.frame to a tibble
voter_data <- dplyr::as_tibble(voter_data)

Step 3: run_geocoder() Function

Select a geocoder and run the geocoder on the addresses in your file.

Select the geocoder you are going to use to find the geographies, such as coordinates (i.e. latitude and longitude) and FIPS codes, for the addresses in the voter file. There are several options for geocoding your data using a geocoding API. The eiCompare package utilizes the US Census Geocoding API via an R package called censusxy. For an alternative, commercially available geocoder, we recommend the Opencage Geocoder API, which has a limit of 2500 requests per day.

Note: You have to have at least 4 CPU cores to use the parallel processing option. If you have more than 10,000 voters in your file, we recommend using parallel processing. More information on parallel processing can be found on the Parallel Processing vignette.

Let’s start geocoding our data

We recommend first geocoding your data with the US Census Geocoder API via the R package, censusxy.

The US Census Geocoder has two options for geocoding output: “simple” and “full”.

“simple” returns coordinates of latitude (lat) and longitude (lon).
“full” returns coordinates, and other variables for geographies from Federal Information Processing Standards (FIPS) codes.

To get the latitude and longitude only, we will set the census_output variable to “simple” and assign each census variable to a desired value.

# # Getting the latitude and longitude coordinates only.
# geocoded_data_simple <- run_geocoder(
#   voter_file = voter_data,
#   geocoder = "census",
#   parallel = FALSE,
#   voter_id = "registration_number",
#   street = "street_address",
#   city = "city",
#   state = "state",
#   zipcode = "zipcode",
#   country = "US",
#   census_return = "locations",
#   census_benchmark = "Public_AR_Current",
#   census_output = "simple",
#   census_class = "sf",
#   census_vintage = 4,
#   opencage_key = NULL
# )

Check the column names of the geocoded dataset. There should be an additional column called geometry with latitude, and longitude coordinates.

# colnames(geocoded_data_simple)

Next, we will use parallel processing to make our geocoder run faster by setting parallel=TRUE and obtain simple geographies by setting census_output=“simple”.

# # Getting the latitude and longitude coordinates only.
# geocoded_data_simple_para <- run_geocoder(
#   voter_file = voter_data,
#   geocoder = "census",
#   parallel = TRUE,
#   voter_id = "registration_number",
#   street = "street_address",
#   city = "city",
#   state = "state",
#   zipcode = "zipcode",
#   country = "US",
#   census_return = "locations",
#   census_benchmark = "Public_AR_Current",
#   census_output = "simple",
#   census_class = "sf",
#   census_vintage = 4,
#   opencage_key = NULL
# )

The voter file now includes latitude and longitude points.

# head(geocoded_data_simple_para)

Using parallel processing, we geocoded the addresses faster, although for a dataset this small the difference is only a matter of seconds. We will now demonstrate how to add the coordinates and FIPS codes by setting the census_output variable to “full” and the census_return variable to “geographies”.

# geocoded_data_full_geo <- run_geocoder(
#   voter_file = voter_data,
#   geocoder = "census",
#   parallel = TRUE,
#   voter_id = "registration_number",
#   street = "street_address",
#   city = "city",
#   state = "state",
#   zipcode = "zipcode",
#   country = "US",
#   census_return = "geographies",
#   census_benchmark = "Public_AR_Current",
#   census_output = "full",
#   census_class = "sf",
#   census_vintage = 4,
#   opencage_key = NULL
# )

The geocoded_data_full_geo object should have an additional column called geometry with latitude and longitude coordinates, along with other variables for geographies.

## Check the first six rows of the geocoded_dataset object
# head(geocoded_data_full_geo)

If there are any missing geocoded addresses, use the run_geocoder() function to re-run the geocoder on those missing geocoded addresses. We will use the geocoded_data_full_geo data to demonstrate how to re-run the geocoder on missing addresses.

# The number of rows missing in new dataframe
# num_miss_geo <- nrow(voter_data) - nrow(geocoded_data_full_geo)
#
# # Only re-run the geocoder if missing data is present.
# if (num_miss_geo > 0) {
#
#   # Find non-geocoded data
#   missing_lonlat_df <- dplyr::anti_join(voter_data, as.data.frame(geocoded_data_full_geo))
#
#   # Run the geocoder on the missing data
#   rerun_data <- run_geocoder(
#     voter_file = missing_lonlat_df,
#     geocoder = "census",
#     parallel = TRUE,
#     voter_id = "registration_number",
#     street = "street_address",
#     city = "city",
#     state = "state",
#     zipcode = "zipcode",
#     country = "US",
#     census_return = "geographies",
#     census_benchmark = "Public_AR_Current",
#     census_output = "full",
#     census_class = "sf",
#     census_vintage = 4,
#     opencage_key = NULL
#   )
# }

Some of the missing addresses may now have been geocoded. Next, we will combine the newly geocoded data in the rerun_data object with the original geocoded data in the geocoded_data_full_geo object.

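# # rerun_data only exists when the re-run above was needed (num_miss_geo > 0)
# geo_combined <- if (num_miss_geo > 0) {
#   rbind(geocoded_data_full_geo, rerun_data)
# } else {
#   geocoded_data_full_geo
# }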

## Check the dimensions of the combined geocoded voter registration dataset
# dim(geo_combined)

# # rename columns to US Census FIPS code variable names
# names(geo_combined)[names(geo_combined) == "cxy_state_id"] <- "STATEFP10"
# names(geo_combined)[names(geo_combined) == "cxy_county_id"] <- "COUNTYFP10"
# names(geo_combined)[names(geo_combined) == "cxy_tract_id"] <- "TRACTCE10"
# names(geo_combined)[names(geo_combined) == "cxy_block_id"] <- "BLOCKCE10"

# geo_combined_df <- as.data.frame(geo_combined)

Step 4: Plot your geocoded data

We will map the area or ecological unit we are interested in using the tigris package for loading in US Census shapefiles.

# # Load shapefile for the state of Georgia using tigris package
# shape_file <- counties(state = "GA")
#
# # Concatenate the state and county codes into column called fips
# shape_file$fips <- paste0(shape_file$STATEFP, shape_file$COUNTYFP)
#
# # Filter shape_file to Fulton (FIPS 13121) and Gwinnett (FIPS 13135) counties
# shape_file <- shape_file[shape_file$fips == "13121" | shape_file$fips == "13135", ]
# shape_file$fulton <- ifelse(shape_file$fips == "13121", 1, 0)
# shape_file$gwinnett <- ifelse(shape_file$fips == "13135", 1, 0)

# # Map shape_file
# county_shape <- map_shape_file(
#   shape_file = shape_file,
#   crs = "+proj=latlong +ellps=GRS80 +no_defs",
#   title = "Gwinnett and Fulton counties"
# )
#
# county_shape

We will now look at the block level of Fulton and Gwinnett counties.

# # Chunk options: warning=FALSE, message=FALSE, results="hide"
# # Load block-level shapefiles for both counties using the tigris package
# gwin_fulton_blocks <- blocks(state = "GA", county = c("Gwinnett", "Fulton"))
#
# # Concatenate the state and county codes into column called fips
# gwin_fulton_blocks$fips <- paste0(gwin_fulton_blocks$STATEFP, gwin_fulton_blocks$COUNTYFP)

# # Chunk options: warning=FALSE, message=FALSE, results="hide"
# gwin_fulton_map <- map_shape_points(
#   voter_file = geo_combined_df,
#   shape_file = gwin_fulton_blocks,
#   crs = "+proj=longlat +ellps=GRS80",
#   title = "Gwinnett and Fulton Counties - All Registered Voters"
# )
#
# gwin_fulton_map

# counties_map_inter <- map_interactive(voter_file=geo_combined_df,
#                                       voter_id = "registration_number",
#                                       f_name = "first_name",
#                                       l_name = "last_name",
#                                       fips_code = "county_code",
#                                       latitude = "lat",
#                                       longitude = "lon")
#
# counties_map_inter