Cleanup geocode.
[python_utils.git] / geocode.py
1 #!/usr/bin/env python3
2
3 """Wrapper around US Census address geocoder API described here:
4 https://www2.census.gov/geo/pdfs/maps-data/data/Census_Geocoder_User_Guide.pdf"""
5
6 import logging
7 import re
8 from typing import Dict, Optional
9
10 import requests
11 from bs4 import BeautifulSoup
12 from requests.utils import requote_uri
13
14 logger = logging.getLogger(__name__)
15
16
# A "label" line is any line whose last non-space character is a colon.
_LABEL_LINE_RE = re.compile(r'.*: *$')


def _parse_labeled_lines(text: str) -> Dict[str, str]:
    """Convert scraped 'Label:' / value lines into a label -> value dict."""
    out: Dict[str, str] = {}
    label = None
    for line in text.split('\n'):
        if _LABEL_LINE_RE.match(line):
            # Drop surrounding whitespace, then the trailing colon.
            label = line.strip()[:-1]
            logger.debug('Label is: "%s"', label)
        elif label:
            # Every non-label line is assigned to the most recent label, so
            # when several lines follow one label the last of them wins.
            out[label] = line.strip()
            logger.debug('Value is: "%s"', out[label])
    return out


def geocode_address(address: str) -> Optional[Dict[str, str]]:
    """Send a single address to the US Census geocoding API.

    The address is submitted to the "onelineaddress" geographies endpoint
    and the HTML reply is scraped: each line of the result block that ends
    in a colon is treated as a label, and the text following it is stored
    as that label's value.

    Args:
        address: A free-form, one-line street address.

    Returns:
        A dict mapping result labels (e.g. 'Matched Address') to their text
        values, or None if the server did not answer with HTTP 200 or the
        reply did not contain the expected result block.

    Raises:
        requests.RequestException: If the HTTP request itself fails or
            times out; transport errors are not swallowed here.

    >>> out = geocode_address('5 Shelbern Dr,,, 07738')
    >>> out['Matched Address']
    '5 SHELBERN DR, LINCROFT, NJ, 07738'

    """
    url = 'https://geocoding.geo.census.gov/geocoder/geographies/onelineaddress'
    # Let requests build the query string so that characters such as '&',
    # '#', '=' or '+' inside the address are percent-encoded rather than
    # being interpreted as query-string syntax.
    params = {
        'address': address,
        'layers': 'all',
        'benchmark': '4',
        'vintage': '4',
    }
    logger.debug('GET: %s params=%s', url, params)
    # Without a timeout a stalled server would block the caller forever.
    r = requests.get(url, params=params, timeout=30)
    if r.status_code != 200:
        logger.error('Unexpected response code %d, wanted 200.  Fail.', r.status_code)
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    result = soup.find('div', id='pl_gov_census_geo_geocoder_domain_AddressResult')
    logger.debug('Unhelpful result blurb: "%s"', result)
    if result is None:
        # find() returns None when the page has no result block; report it
        # as a failed lookup instead of crashing on result.get_text().
        logger.error('Response did not contain an address result block.  Fail.')
        return None
    return _parse_labeled_lines(result.get_text('\n'))
51
52
if __name__ == '__main__':
    # When executed as a script, run this module's doctests.  Note that the
    # geocode_address example calls the function for real, which performs an
    # HTTP request.
    from doctest import testmod

    testmod()