geocode.py

   1 #!/usr/bin/env python3
   2
   3 # © Copyright 2022, Scott Gasch
   4
   5 """Wrapper around US Census address geocoder API described here:
   6 https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf
   7 https://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
   8 """
   9
  10 import logging
  11 from typing import Any, Dict, List, Optional
  12
  13 import requests
  14 from requests.utils import requote_uri
  15
# Module-level logger, named after this module, shared by the functions below.
logger = logging.getLogger(__name__)
  17
  18
  19 def geocode_address(address: str) -> Optional[Dict[str, Any]]:
  20     """Send a single address to the US Census geocoding API.  The response
  21     is a parsed JSON chunk of data with N addressMatches in the result
  22     section and the details of each match within it.  Returns None on error.
  23
  24     >>> json = geocode_address('4600 Silver Hill Rd,, 20233')
  25     >>> json['result']['addressMatches'][0]['matchedAddress']
  26     '4600 SILVER HILL RD, WASHINGTON, DC, 20233'
  27
  28     >>> json['result']['addressMatches'][0]['coordinates']
  29     {'x': -76.92743, 'y': 38.84599}
  30
  31     """
  32     url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress'
  33     url += f'?address={address}'
  34     url += '&returntype=geographies&layers=all&benchmark=4&vintage=4&format=json'
  35     url = requote_uri(url)
  36     logger.debug('GET: %s', url)
  37     try:
  38         r = requests.get(url)
  39     except Exception as e:
  40         logger.exception(e)
  41         return None
  42
  43     if r.status_code != 200:
  44         logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
  45         return None
  46     # print(json.dumps(r.json(), indent=4, sort_keys=True))
  47     return r.json()
  48
  49
  50 def batch_geocode_addresses(addresses: List[str]):
  51     """Send up to addresses for batch geocoding.  Each line of the input
  52     list should be a single address of the form: STREET ADDRESS, CITY,
  53     STATE, ZIP.  Components may be omitted but the commas may not be.
  54     Result is an array of the same size as the input array with one
  55     answer record per line.  Returns None on error.
  56
  57     >>> batch_geocode_addresses(
  58     ...     [
  59     ...         '4600 Silver Hill Rd, Washington, DC, 20233',
  60     ...         '935 Pennsylvania Avenue, NW, Washington, DC, 20535-0001',
  61     ...         '1600 Pennsylvania Avenue NW, Washington, DC, 20500',
  62     ...         '700 Pennsylvania Avenue NW, Washington, DC, 20408',
  63     ...     ]
  64     ... )
  65     ['"1"," 4600 Silver Hill Rd,  Washington,  DC,  20233","Match","Exact","4600 SILVER HILL RD, WASHINGTON, DC, 20233","-76.92743,38.84599","76355984","L","24","033","802405","2004"', '"2"," 935 Pennsylvania Avenue,  NW,  Washington,  DC","No_Match"', '"3"," 1600 Pennsylvania Avenue NW,  Washington,  DC,  20500","Match","Exact","1600 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20500","-77.03534,38.898754","76225813","L","11","001","980000","1034"', '"4"," 700 Pennsylvania Avenue NW,  Washington,  DC,  20408","Match","Exact","700 PENNSYLVANIA AVE NW, WASHINGTON, DC, 20408","-77.02304,38.89362","76226346","L","11","001","980000","1025"']
  66     """
  67
  68     # TODO: use list_utils.shard to break up the input if it's >10k records
  69     # b/c the census website has a hard limit at 10k.
  70
  71     url = 'https://geocoding.geo.census.gov/geocoder/geographies/addressbatch'
  72     payload = {'benchmark': '4', 'vintage': '4'}
  73     raw_file = ''
  74     for n, address in enumerate(addresses):
  75         raw_file += f'{n+1}, {address}\n'
  76     files = {'addressFile': ('input.csv', raw_file)}
  77     logger.debug('POST: %s', url)
  78     try:
  79         r = requests.post(url, files=files, data=payload)
  80     except Exception as e:
  81         logger.exception(e)
  82         return None
  83     if r.status_code != 200:
  84         print(r.text)
  85         logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
  86         return None
  87
  88     out = []
  89     for line in r.text.split('\n'):
  90         line = line.strip()
  91         if len(line) > 0:
  92             out.append(line)
  93     return out
  94
  95
  96 if __name__ == '__main__':
  97     import doctest
  98
  99     doctest.testmod()