# © Copyright 2022, Scott Gasch
"""Wrapper around US Census address geocoder API described here:
-https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf"""
+https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf
+https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
+"""
import logging
-import re
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
import requests
-from bs4 import BeautifulSoup
from requests.utils import requote_uri
-import string_utils
-
logger = logging.getLogger(__name__)
def geocode_address(address: str) -> Optional[Dict[str, Any]]:
- """Send a single address to the US Census geocoding API.
+ """Send a single address to the US Census geocoding API. The response
+ is a parsed JSON chunk of data with N addressMatches in the result
+ section and the details of each match within it. Returns None on error.
- >>> out = geocode_address('4600 Silver Hill Rd,, 20233')
- >>> out['Matched Address']
+ >>> json = geocode_address('4600 Silver Hill Rd,, 20233')
+ >>> json['result']['addressMatches'][0]['matchedAddress']
'4600 SILVER HILL RD, WASHINGTON, DC, 20233'
- >>> out['Interpolated Longitude (X) Coordinates']
- -76.92743
- >>> out['Interpolated Latitude (Y) Coordinates']
- 38.84599
+
+ >>> json['result']['addressMatches'][0]['coordinates']
+ {'x': -76.92743, 'y': 38.84599}
"""
url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress'
url += f'?address={address}'
- url += '&layers=all&benchmark=4&vintage=4'
+ url += '&returntype=geographies&layers=all&benchmark=4&vintage=4&format=json'
url = requote_uri(url)
logger.debug('GET: %s', url)
try:
if r.status_code != 200:
logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code)
return None
- else:
- soup = BeautifulSoup(r.text, 'html.parser')
- result = soup.find('div', id='pl_gov_census_geo_geocoder_domain_AddressResult')
- logger.debug('Unhelpful result blurb: "%s"', result)
- output = result.get_text('\n')
- label = None
- out = {}
- for line in output.split('\n'):
- if re.match(r'.*: *$', line):
- line = line.strip()
- label = line[:-1]
- logger.debug('Label is: "%s"', label)
- else:
- if label:
- value = line.strip()
- if string_utils.is_integer_number(value):
- value = int(value)
- elif string_utils.is_number(value):
- value = float(value)
- logger.debug('Value is: "%s"', value)
- out[label] = value
+ # print(json.dumps(r.json(), indent=4, sort_keys=True))
+ return r.json()
+
+
+def batch_geocode_addresses(addresses: List[str]):
+ """Send up to 10,000 addresses for batch geocoding. Each line of the input
+ list should be a single address of the form: STREET ADDRESS, CITY,
+ STATE, ZIP. Components may be omitted but the commas may not be.
+ Result is an array of the same size as the input array with one
+ answer record per line. Returns None on error.
+
+ >>> batch_geocode_addresses(
+ ... [
+ ... '4600 Silver Hill Rd, Washington, DC, 20233',
+ ... '935 Pennsylvania Avenue, NW, Washington, DC, 20535-0001',
+ ... '1600 Pennsylvania Avenue NW, Washington, DC, 20500',
+ ... '700 Pennsylvania Avenue NW, Washington, DC, 20408',
+ ... ]
+ ... )
+ ['"1"," 4600 Silver Hill Rd, Washington, DC, 20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue, NW, Washington, DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW, Washington, DC, 20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW, Washington, DC, 20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"']
+ """
+
+ # TODO: use list_utils.shard to break up the input if it's >10k records
+ # b/c the census website has a hard limit at 10k.
+
+ url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
+ payload = {'benchmark': '4', 'vintage': '4'}
+ raw_file = ''
+ for n, address in enumerate(addresses):
+ raw_file += f'{n+1}, {address}\n'
+ files = {'addressFile': ('input.csv', raw_file)}
+ logger.debug('POST: %s', url)
+ try:
+ r = requests.post(url, files=files, data=payload)
+ except Exception as e:
+ logger.exception(e)
+ return None
+ if r.status_code != 200:
+ print(r.text)
+ logger.error('Unexpected response code %d, wanted 200. Fail.', r.status_code)
+ return None
+
+ out = []
+ for line in r.text.split('\n'):
+ line = line.strip()
+ if len(line) > 0:
+ out.append(line)
return out