# Source code for pyutils.geocode

#!/usr/bin/env python3

# © Copyright 2022, Scott Gasch

"""Wrapper around US Census address geocoder API described here:

* https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf
* https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf

Also try::

    $ curl --form addressFile=@localfile.csv \\
           --form benchmark=2020 \\
           https://geocoding.geo.census.gov/geocoder/locations/addressbatch \\
           --output geocoderesult.csv
"""

import functools
import json
import logging
from typing import Any, Dict, List, Optional

import requests
from requests.utils import requote_uri

from pyutils import list_utils

logger = logging.getLogger(__name__)


@functools.lru_cache(maxsize=256)
def geocode_address(address: str) -> Optional[Dict[str, Any]]:
    """Send a single address to the US Census geocoding API in order to
    lookup relevant data about it (including, if possible, its
    lat/long).  The response is a parsed JSON chunk of data with N
    addressMatches in the result section and the details of each match
    within it.

    Results (including a None returned on failure) are memoized by
    :py:func:`functools.lru_cache`, so repeating the same address string
    does not trigger another request while it remains in the cache.

    Args:
        address: the full address to lookup in the form: "STREET
            ADDRESS, CITY, STATE, ZIPCODE".  These components may be
            omitted and the service will make educated guesses but
            the commas delimiting each component must be included.

    Returns:
        A parsed json dict with a bunch of information about the
        address contained within it.  Each 'addressMatch'
        in the JSON describes the details of a possible match.
        Returns None if there was an error (request failure, non-2xx
        response, or a body that is not valid JSON) or the address is
        not known.

    >>> response = geocode_address('4600 Silver Hill Rd,, 20233')
    >>> response['result']['addressMatches'][0]['matchedAddress']
    '4600 SILVER HILL RD, WASHINGTON, DC, 20233'

    >>> response['result']['addressMatches'][0]['coordinates']
    {'x': -76.92748724230096, 'y': 38.84601622386617}
    """
    url = "https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress"
    # Hand the query to requests as a dict so that every value is
    # percent-encoded.  Splicing the raw address into the URL let
    # characters such as '&', '#' or '+' corrupt the query string.
    params = {
        "address": address,
        "returntype": "geographies",
        "layers": "all",
        "benchmark": "4",
        "vintage": "4",
        "format": "json",
    }
    logger.debug("GET: %s params=%s", url, params)
    try:
        r = requests.get(url, params=params, timeout=10.0)
    except Exception as e:
        logger.exception(e)
        return None

    if not r.ok:
        logger.debug(r.text)
        logger.error("Unexpected response code %d, wanted 200.  Fail.", r.status_code)
        return None

    # Decode the body exactly once; a malformed body is reported as a
    # failure (None) instead of escaping as an exception.
    try:
        parsed = r.json()
    except ValueError as e:
        logger.exception(e)
        return None

    # Pretty-printing the whole response is only worth doing when
    # someone will actually see it.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Response: %s", json.dumps(parsed, indent=4, sort_keys=True))
    return parsed
def batch_geocode_addresses(addresses: List[str]) -> Optional[List[str]]:
    """Send a list of addresses for batch geocoding to a web service
    operated by the US Census Bureau.

    Args:
        addresses: a list of addresses to geocode.  Each line of the
            input list should be a single address in the form: "STREET
            ADDRESS, CITY, STATE, ZIPCODE".  Individual address components
            may be omitted and the service will make educated guesses but
            the commas delimiters between address components may not be
            omitted.

    Returns:
        An array of the same size as the input array with one
        answer record per line.  An empty input yields an empty list
        without contacting the service.  Returns None on error.

    Note: this code will deal with requests >10k addresses by chunking
    them internally because the census website disallows requests >
    10k lines.

    >>> batch_geocode_addresses(
    ...     [
    ...         '4600 Silver Hill Rd, Washington, DC, 20233',
    ...         '935 Pennsylvania Avenue, NW, Washington, DC, 20535-0001',
    ...         '1600 Pennsylvania Avenue NW, Washington, DC, 20500',
    ...         '700 Pennsylvania Avenue NW, Washington, DC, 20408',
    ...     ]
    ... )
    ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92748724230091,38.84601622386623","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03654395730786,38.89869091865552","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02305485155983,38.89356561956657","76226346","L","11","001","980000","1025"']
    """
    # Nothing to geocode: answer immediately rather than depending on
    # how the sharding helper treats an empty list.
    if not addresses:
        return []

    url = "https://geocoding.geo.census.gov/geocoder/geographies/addressbatch"
    payload = {"benchmark": "4", "vintage": "4"}
    out: List[str] = []

    # Record ids keep counting across chunks so that every input line
    # gets a unique id in the combined output.
    n = 1
    for chunk in list_utils.shard(addresses, 9999):
        # Collect the rows and join them once; repeated "+=" on a string
        # is quadratic in the number of rows.
        rows = []
        for address in chunk:
            rows.append(f"{n}, {address}\n")
            n += 1
        raw_file = "".join(rows)

        files = {"addressFile": ("input.csv", raw_file)}
        logger.debug("POST: %s", url)
        try:
            r = requests.post(url, files=files, data=payload, timeout=10.0)
        except Exception as e:
            logger.exception(e)
            return None

        if not r.ok:
            logger.debug(r.text)
            logger.error(
                "Unexpected response code %d, wanted 200.  Fail.", r.status_code
            )
            return None

        logger.debug("Response: %s", r.text)
        # Keep every non-blank response line, stripped of surrounding
        # whitespace.
        stripped = (line.strip() for line in r.text.split("\n"))
        out.extend(line for line in stripped if line)
    return out
# When executed directly as a script, run the doctests embedded in this
# module's docstrings.
if __name__ == "__main__":
    import doctest

    doctest.testmod()